From 8191d21fb1e6e6b31b89d3d486ed9e092277d217 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Fri, 28 Jun 2024 22:59:05 -0400 Subject: [PATCH 01/28] Exllamav2_filter --- outlines/integrations/exllamav2.py | 218 +++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 outlines/integrations/exllamav2.py diff --git a/outlines/integrations/exllamav2.py b/outlines/integrations/exllamav2.py new file mode 100644 index 000000000..7434389e8 --- /dev/null +++ b/outlines/integrations/exllamav2.py @@ -0,0 +1,218 @@ +"""Make Hugging Face transformers compatible with Outlines' structured generation. + + _______________________________ +/ Don't want to self-host? \ +\\ Try .json at http://dottxt.co / + ------------------------------- + \\ ^__^ + \\ (oo)\\_______ + (__)\\ )\\/\ + ||----w | + || || + +Copyright 2024- the Outlines developers + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from collections import defaultdict +from typing import DefaultDict, Iterable, Optional, Type, Union + +import torch +from pydantic import BaseModel +from exllamav2 import ExLlamaV2 +from transformers import Pipeline, PreTrainedTokenizerBase + + +from outlines.fsm.guide import RegexGuide, StopAtEOSGuide, Guide +from outlines.fsm.json_schema import build_regex_from_schema +from outlines.integrations.utils import adapt_tokenizer, convert_json_schema_to_str +from outlines.generate.generator import is_generation_finished + +import copy + +class FSMFilter: + """Bias transformers generation based on a fsm. + + Attributes + ---------- + fsm + The finite state machine which is used to bias the logits. + """ + token_sequence: list[int] + seq_id: int + + def __init__( + self, + fsm: Guide + ): + """Compile the FSM that drives generation. + + Parameters + ---------- + fsm + The fsm of the model. + """ + self.fsm = fsm + self._fsm_state: DefaultDict[int, int] = defaultdict(int) + self.token_sequence = [] + + def begin(self, prefix_str: str = "") -> None: + self._fsm_state = defaultdict(int) + self.seq_id = hash(tuple([])) + def feed(self, token: torch.Tensor) -> None: + int_token = int(token[0][0].numpy()) + + last_seq_id = self.seq_id + self.token_sequence.append(int_token) + self.seq_id = hash(tuple(self.token_sequence)) + self._fsm_state[self.seq_id] = self.fsm.get_next_state( + state=self._fsm_state[last_seq_id], token_id=int_token + ) + + def clone(self): + return copy.deepcopy(self) + def next(self) -> tuple[set[int], set[int]]: + allowed_tokens = self.fsm.get_next_instruction( + state=self._fsm_state[self.seq_id] + ).tokens + end_tokens = [] + for token in allowed_tokens: + next_state = self.fsm.get_next_state( + state=self._fsm_state[self.seq_id], token_id=token + ) + if is_generation_finished([self.fsm], [next_state]): + end_tokens.append(token) + return set(allowed_tokens), set(end_tokens) + +class RegexFilter(FSMFilter): + """Bias transformers generation based on a regular expression. + + Attributes + ---------- + fsm + The finite state machine which is used to bias the logits. + """ + def __init__( + self, + regex_string: str, + tokenizer: PreTrainedTokenizerBase, + ): + """Compile the FSM that drives the regex-structured generation. + + Parameters + ---------- + tokenizer + The tokenizer of the model. + + Raises + ------ + ValueError + If the `tokenizer` parameter is not a tokenizer. + """ + assert isinstance(tokenizer, PreTrainedTokenizerBase) + tokenizer = adapt_tokenizer(tokenizer=tokenizer) + fsm = RegexGuide(regex_string=regex_string, tokenizer=tokenizer) + super().__init__(fsm) + +class TextFilter: + """Bias transformers generation based on a stop at eos text expression. + + Attributes + ---------- + fsm + The finite state machine which is used to bias the logits. + """ + def __init__( + self, + tokenizer: PreTrainedTokenizerBase, + ): + """Compile the FSM that drives text generation. + + Parameters + ---------- + tokenizer + The tokenizer of the model. + + Raises + ------ + ValueError + If the `tokenizer` parameter is not a tokenizer. + """ + assert isinstance(tokenizer, PreTrainedTokenizerBase) + tokenizer = adapt_tokenizer(tokenizer=tokenizer) + fsm = StopAtEOSGuide(tokenizer=tokenizer) + super().__init__(fsm) + +class JSONFilter(RegexFilter): + """Bias exllamav2 generation based on a JSON schema. + + Attributes + ---------- + fsm + The finite state machine which is used to bias the logits. + """ + + def __init__( + self, + schema: Union[dict, Type[BaseModel], str], + tokenizer: PreTrainedTokenizerBase, + whitespace_pattern: Optional[str] = None, + ): + """Compile the FSM that drives the JSON-guided generation. + + Parameters + ---------- + schema + A schema that encodes the structure we want the model to generate. + tokenizer + The tokenizer of the model. + whitespace_pattern + Pattern to use for JSON syntactic whitespace (doesn't impact string + literals). For example, to allow only a single space or newline with + `whitespace_pattern=r"[\n ]?"` + """ + schema_str = convert_json_schema_to_str(json_schema=schema) + regex_string = build_regex_from_schema(schema_str, whitespace_pattern) + super().__init__(regex_string=regex_string, tokenizer=tokenizer) + +class ChoiceFilter(RegexFilter): + """Bias exllamav2 generation based on choices. + + Attributes + ---------- + fsm + The finite state machine which is used to bias the logits. + """ + + def __init__( + self, + choices: list[str], + tokenizer: PreTrainedTokenizerBase, + ): + """Compile the FSM that drives the JSON-guided generation. + + Parameters + ---------- + schema + A schema that encodes the structure we want the model to generate. + tokenizer + The tokenizer of the model. + whitespace_pattern + Pattern to use for JSON syntactic whitespace (doesn't impact string + literals). For example, to allow only a single space or newline with + `whitespace_pattern=r"[\n ]?"` + """ + regex_string = r"(" + r"|".join(choices) + r")" + super().__init__(regex_string=regex_string, tokenizer=tokenizer) + From 42978c4cd2ac10dce247774493e3a5881db8bfca Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Fri, 28 Jun 2024 23:10:48 -0400 Subject: [PATCH 02/28] Fix comment --- outlines/integrations/exllamav2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/outlines/integrations/exllamav2.py b/outlines/integrations/exllamav2.py index 7434389e8..737a0ff58 100644 --- a/outlines/integrations/exllamav2.py +++ b/outlines/integrations/exllamav2.py @@ -1,4 +1,4 @@ -"""Make Hugging Face transformers compatible with Outlines' structured generation. +"""Make ExllamaV2 compatible with Outlines' structured generation. _______________________________ / Don't want to self-host? \ From d271fff3052d2a52e2adc90069cc26e9666440f0 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Fri, 28 Jun 2024 23:28:25 -0400 Subject: [PATCH 03/28] Fixed precommit issues --- outlines/integrations/exllamav2.py | 31 +++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/outlines/integrations/exllamav2.py b/outlines/integrations/exllamav2.py index 737a0ff58..ceb6a1e8e 100644 --- a/outlines/integrations/exllamav2.py +++ b/outlines/integrations/exllamav2.py @@ -25,21 +25,19 @@ limitations under the License. """ +import copy from collections import defaultdict -from typing import DefaultDict, Iterable, Optional, Type, Union +from typing import DefaultDict, Optional, Type, Union import torch from pydantic import BaseModel -from exllamav2 import ExLlamaV2 -from transformers import Pipeline, PreTrainedTokenizerBase - +from transformers import PreTrainedTokenizerBase -from outlines.fsm.guide import RegexGuide, StopAtEOSGuide, Guide +from outlines.fsm.guide import Guide, RegexGuide, StopAtEOSGuide from outlines.fsm.json_schema import build_regex_from_schema -from outlines.integrations.utils import adapt_tokenizer, convert_json_schema_to_str from outlines.generate.generator import is_generation_finished +from outlines.integrations.utils import adapt_tokenizer, convert_json_schema_to_str -import copy class FSMFilter: """Bias transformers generation based on a fsm. @@ -49,13 +47,11 @@ class FSMFilter: fsm The finite state machine which is used to bias the logits. """ + token_sequence: list[int] seq_id: int - def __init__( - self, - fsm: Guide - ): + def __init__(self, fsm: Guide): """Compile the FSM that drives generation. Parameters @@ -70,6 +66,7 @@ def __init__( def begin(self, prefix_str: str = "") -> None: self._fsm_state = defaultdict(int) self.seq_id = hash(tuple([])) + def feed(self, token: torch.Tensor) -> None: int_token = int(token[0][0].numpy()) @@ -82,10 +79,13 @@ def feed(self, token: torch.Tensor) -> None: def clone(self): return copy.deepcopy(self) + def next(self) -> tuple[set[int], set[int]]: allowed_tokens = self.fsm.get_next_instruction( state=self._fsm_state[self.seq_id] ).tokens + if allowed_tokens is None: + allowed_tokens = [] end_tokens = [] for token in allowed_tokens: next_state = self.fsm.get_next_state( @@ -95,6 +95,7 @@ def next(self) -> tuple[set[int], set[int]]: end_tokens.append(token) return set(allowed_tokens), set(end_tokens) + class RegexFilter(FSMFilter): """Bias transformers generation based on a regular expression. @@ -103,6 +104,7 @@ class RegexFilter(FSMFilter): fsm The finite state machine which is used to bias the logits. """ + def __init__( self, regex_string: str, @@ -125,7 +127,8 @@ def __init__( fsm = RegexGuide(regex_string=regex_string, tokenizer=tokenizer) super().__init__(fsm) -class TextFilter: + +class TextFilter(FSMFilter): """Bias transformers generation based on a stop at eos text expression. Attributes @@ -133,6 +136,7 @@ class TextFilter: fsm The finite state machine which is used to bias the logits. """ + def __init__( self, tokenizer: PreTrainedTokenizerBase, @@ -154,6 +158,7 @@ def __init__( fsm = StopAtEOSGuide(tokenizer=tokenizer) super().__init__(fsm) + class JSONFilter(RegexFilter): """Bias exllamav2 generation based on a JSON schema. @@ -186,6 +191,7 @@ def __init__( regex_string = build_regex_from_schema(schema_str, whitespace_pattern) super().__init__(regex_string=regex_string, tokenizer=tokenizer) + class ChoiceFilter(RegexFilter): """Bias exllamav2 generation based on choices. @@ -215,4 +221,3 @@ def __init__( """ regex_string = r"(" + r"|".join(choices) + r")" super().__init__(regex_string=regex_string, tokenizer=tokenizer) - From 1bdcd4e6da28b67a9334d511ac93f8b7b61b9502 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Wed, 10 Jul 2024 12:48:41 -0400 Subject: [PATCH 04/28] Removed text --- outlines/integrations/exllamav2.py | 33 +----------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/outlines/integrations/exllamav2.py b/outlines/integrations/exllamav2.py index ceb6a1e8e..1f9a3a159 100644 --- a/outlines/integrations/exllamav2.py +++ b/outlines/integrations/exllamav2.py @@ -33,7 +33,7 @@ from pydantic import BaseModel from transformers import PreTrainedTokenizerBase -from outlines.fsm.guide import Guide, RegexGuide, StopAtEOSGuide +from outlines.fsm.guide import Guide, RegexGuide from outlines.fsm.json_schema import build_regex_from_schema from outlines.generate.generator import is_generation_finished from outlines.integrations.utils import adapt_tokenizer, convert_json_schema_to_str @@ -128,37 +128,6 @@ def __init__( super().__init__(fsm) -class TextFilter(FSMFilter): - """Bias transformers generation based on a stop at eos text expression. - - Attributes - ---------- - fsm - The finite state machine which is used to bias the logits. - """ - - def __init__( - self, - tokenizer: PreTrainedTokenizerBase, - ): - """Compile the FSM that drives text generation. - - Parameters - ---------- - tokenizer - The tokenizer of the model. - - Raises - ------ - ValueError - If the `tokenizer` parameter is not a tokenizer. - """ - assert isinstance(tokenizer, PreTrainedTokenizerBase) - tokenizer = adapt_tokenizer(tokenizer=tokenizer) - fsm = StopAtEOSGuide(tokenizer=tokenizer) - super().__init__(fsm) - - class JSONFilter(RegexFilter): """Bias exllamav2 generation based on a JSON schema. From 37d2471b4f361dfd8c0b4f1aec5fcfa66c82a71d Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Thu, 1 Aug 2024 15:12:36 -0400 Subject: [PATCH 05/28] Basic draft done --- outlines/generate/fsm.py | 10 - outlines/generate/regex.py | 14 -- outlines/generate/text.py | 7 - outlines/integrations/exllamav2.py | 192 ----------------- outlines/models/exllamav2.py | 321 ++++++++++++++++------------- tests/generate/test_generate.py | 8 + 6 files changed, 180 insertions(+), 372 deletions(-) delete mode 100644 outlines/integrations/exllamav2.py diff --git a/outlines/generate/fsm.py b/outlines/generate/fsm.py index 03fe512b9..4fcc625e5 100644 --- a/outlines/generate/fsm.py +++ b/outlines/generate/fsm.py @@ -30,13 +30,3 @@ def fsm_vision(model, fsm: interegular.fsm.FSM, sampler: Sampler = multinomial() fsm = RegexGuide.from_interegular_fsm(fsm, model.tokenizer) logits_processor = FSMLogitsProcessor(tokenizer=model.tokenizer, fsm=fsm) return VisionSequenceGeneratorAdapter(model, logits_processor, sampler) - - -@fsm.register(ExLlamaV2Model) -def fsm_exllamav2( - model, fsm: interegular.fsm.FSM, sampler: Sampler = multinomial() -) -> SequenceGenerator: - fsm = RegexGuide.from_interegular_fsm(fsm, model.tokenizer) - device = model.device - generator = SequenceGenerator(fsm, model, sampler, device) - return generator diff --git a/outlines/generate/regex.py b/outlines/generate/regex.py index 815a8b1b9..a2270c7d4 100644 --- a/outlines/generate/regex.py +++ b/outlines/generate/regex.py @@ -49,20 +49,6 @@ def regex_vision( return VisionSequenceGeneratorAdapter(model, logits_processor, sampler) -@regex.register(ExLlamaV2Model) -def regex_exllamav2( - model, - regex_str: str, - sampler: Sampler = multinomial(), -) -> SequenceGenerator: - fsm = RegexGuide(regex_str, model.tokenizer) - - device = model.device - generator = SequenceGenerator(fsm, model, sampler, device) - - return generator - - @regex.register(OpenAI) def regex_openai( model: OpenAI, diff --git a/outlines/generate/text.py b/outlines/generate/text.py index 3fe3dc553..fad9ea7a3 100644 --- a/outlines/generate/text.py +++ b/outlines/generate/text.py @@ -36,13 +36,6 @@ def text(model, sampler: Sampler = multinomial()) -> SequenceGeneratorAdapter: return SequenceGeneratorAdapter(model, None, sampler) -@text.register(ExLlamaV2Model) -def text_exllamav2(model, sampler: Sampler = multinomial()) -> SequenceGenerator: - fsm = StopAtEOSGuide(model.tokenizer) - device = model.device - return SequenceGenerator(fsm, model, sampler, device) - - @text.register(TransformersVision) def text_vision(model, sampler: Sampler = multinomial()): return VisionSequenceGeneratorAdapter(model, None, sampler) diff --git a/outlines/integrations/exllamav2.py b/outlines/integrations/exllamav2.py deleted file mode 100644 index 1f9a3a159..000000000 --- a/outlines/integrations/exllamav2.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Make ExllamaV2 compatible with Outlines' structured generation. - - _______________________________ -/ Don't want to self-host? \ -\\ Try .json at http://dottxt.co / - ------------------------------- - \\ ^__^ - \\ (oo)\\_______ - (__)\\ )\\/\ - ||----w | - || || - -Copyright 2024- the Outlines developers - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" - -import copy -from collections import defaultdict -from typing import DefaultDict, Optional, Type, Union - -import torch -from pydantic import BaseModel -from transformers import PreTrainedTokenizerBase - -from outlines.fsm.guide import Guide, RegexGuide -from outlines.fsm.json_schema import build_regex_from_schema -from outlines.generate.generator import is_generation_finished -from outlines.integrations.utils import adapt_tokenizer, convert_json_schema_to_str - - -class FSMFilter: - """Bias transformers generation based on a fsm. - - Attributes - ---------- - fsm - The finite state machine which is used to bias the logits. - """ - - token_sequence: list[int] - seq_id: int - - def __init__(self, fsm: Guide): - """Compile the FSM that drives generation. - - Parameters - ---------- - fsm - The fsm of the model. - """ - self.fsm = fsm - self._fsm_state: DefaultDict[int, int] = defaultdict(int) - self.token_sequence = [] - - def begin(self, prefix_str: str = "") -> None: - self._fsm_state = defaultdict(int) - self.seq_id = hash(tuple([])) - - def feed(self, token: torch.Tensor) -> None: - int_token = int(token[0][0].numpy()) - - last_seq_id = self.seq_id - self.token_sequence.append(int_token) - self.seq_id = hash(tuple(self.token_sequence)) - self._fsm_state[self.seq_id] = self.fsm.get_next_state( - state=self._fsm_state[last_seq_id], token_id=int_token - ) - - def clone(self): - return copy.deepcopy(self) - - def next(self) -> tuple[set[int], set[int]]: - allowed_tokens = self.fsm.get_next_instruction( - state=self._fsm_state[self.seq_id] - ).tokens - if allowed_tokens is None: - allowed_tokens = [] - end_tokens = [] - for token in allowed_tokens: - next_state = self.fsm.get_next_state( - state=self._fsm_state[self.seq_id], token_id=token - ) - if is_generation_finished([self.fsm], [next_state]): - end_tokens.append(token) - return set(allowed_tokens), set(end_tokens) - - -class RegexFilter(FSMFilter): - """Bias transformers generation based on a regular expression. - - Attributes - ---------- - fsm - The finite state machine which is used to bias the logits. - """ - - def __init__( - self, - regex_string: str, - tokenizer: PreTrainedTokenizerBase, - ): - """Compile the FSM that drives the regex-structured generation. - - Parameters - ---------- - tokenizer - The tokenizer of the model. - - Raises - ------ - ValueError - If the `tokenizer` parameter is not a tokenizer. - """ - assert isinstance(tokenizer, PreTrainedTokenizerBase) - tokenizer = adapt_tokenizer(tokenizer=tokenizer) - fsm = RegexGuide(regex_string=regex_string, tokenizer=tokenizer) - super().__init__(fsm) - - -class JSONFilter(RegexFilter): - """Bias exllamav2 generation based on a JSON schema. - - Attributes - ---------- - fsm - The finite state machine which is used to bias the logits. - """ - - def __init__( - self, - schema: Union[dict, Type[BaseModel], str], - tokenizer: PreTrainedTokenizerBase, - whitespace_pattern: Optional[str] = None, - ): - """Compile the FSM that drives the JSON-guided generation. - - Parameters - ---------- - schema - A schema that encodes the structure we want the model to generate. - tokenizer - The tokenizer of the model. - whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string - literals). For example, to allow only a single space or newline with - `whitespace_pattern=r"[\n ]?"` - """ - schema_str = convert_json_schema_to_str(json_schema=schema) - regex_string = build_regex_from_schema(schema_str, whitespace_pattern) - super().__init__(regex_string=regex_string, tokenizer=tokenizer) - - -class ChoiceFilter(RegexFilter): - """Bias exllamav2 generation based on choices. - - Attributes - ---------- - fsm - The finite state machine which is used to bias the logits. - """ - - def __init__( - self, - choices: list[str], - tokenizer: PreTrainedTokenizerBase, - ): - """Compile the FSM that drives the JSON-guided generation. - - Parameters - ---------- - schema - A schema that encodes the structure we want the model to generate. - tokenizer - The tokenizer of the model. - whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string - literals). For example, to allow only a single space or newline with - `whitespace_pattern=r"[\n ]?"` - """ - regex_string = r"(" + r"|".join(choices) + r")" - super().__init__(regex_string=regex_string, tokenizer=tokenizer) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 0ec6ef033..369d63ad2 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -1,121 +1,152 @@ import os -from typing import TYPE_CHECKING, Optional +from typing import ( + TYPE_CHECKING, + Iterator, + List, + Optional, + TypedDict, + Union, +) + +from typing_extensions import Unpack + +from outlines.generate.api import GenerationParameters, SamplingParameters if TYPE_CHECKING: - from exllamav2 import ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Lora - from transformers import PreTrainedTokenizer - import torch + from exllamav2 import ExLlamaV2, ExLlamaV2Lora, ExLlamaV2Tokenizer + from exllamav2.generator import ExLlamaV2Sampler + from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler, ExLlamaV2DynamicJob -from .transformers import TransformerTokenizer +class ExllamaV2Params(TypedDict, total=False): + min_new_tokens: int = 0, + seed: Union[int, None] = None, + encode_special_tokens: bool = False, + decode_special_tokens: bool = False, + add_bos: bool = False, class ExLlamaV2Model: """Represents a `exl2` model.""" - def __init__( self, - model: "ExLlamaV2", - tokenizer: "PreTrainedTokenizer", - device, - cache: "ExLlamaV2Cache", - lora: Optional["ExLlamaV2Lora"] = None, + generator: ExLlamaV2DynamicGenerator, ): - self.device = device - self.model = model - self.tokenizer = TransformerTokenizer(tokenizer) - self.cache = cache - self.past_seq = None - self.lora = lora - - def forward(self, input_ids: "torch.LongTensor", *_): - """Compute a forward pass through the exl2 model.""" - import torch - - # Caching with past_seq - reset = True - seq_tensor = input_ids[0] - - if self.past_seq is not None: - min_length = min(self.past_seq.shape[0], seq_tensor.shape[0]) - indices = torch.nonzero( - ~torch.eq(self.past_seq[:min_length], seq_tensor[:min_length]) - ) - if len(indices) > 0: - longest_prefix = indices[0].item() - else: - longest_prefix = min_length - - if longest_prefix > 0: - reset = False - self.cache.current_seq_len = longest_prefix - if seq_tensor.shape[0] - longest_prefix > 1: - self.model.forward( - seq_tensor[longest_prefix:-1].view(1, -1), - self.cache, - preprocess_only=True, - loras=[self.lora], - ) - elif seq_tensor.shape[0] == longest_prefix: - self.cache.current_seq_len -= 1 - - if reset: - self.cache.current_seq_len = 0 - if seq_tensor.shape[0] > 1: - self.model.forward( - seq_tensor[:-1].view(1, -1), - self.cache, - preprocess_only=True, - loras=[self.lora], - ) - - self.past_seq = seq_tensor - - return self.model.forward( - seq_tensor[-1:].view(1, -1), self.cache, loras=[self.lora] + self.generator = generator + def generate( + self, + prompts: Union[str, List[str]], + generation_parameters: GenerationParameters, + structure_logits_processor, + sampling_parameters: SamplingParameters, + **exllamav2_params: Unpack[ExllamaV2Params] + ) -> List[str]: + stop_conditions = [self.generator.tokenizer.eos_token_id] + if isinstance(generation_parameters.stop_at, str): + stop_conditions.append(generation_parameters.stop_at) + elif isinstance(generation_parameters.stop_at, list): + for stop_at in generation_parameters.stop_at: + stop_conditions.append(stop_at) + gen_settings = ExLlamaV2Sampler.Settings() + if sampling_parameters.temperature is not None: + gen_settings.temperature = sampling_parameters.temperature + if sampling_parameters.top_p is not None: + gen_settings.top_p = sampling_parameters.top_p + if sampling_parameters.top_k is not None: + gen_settings.top_k = sampling_parameters.top_k + gen_settings.logits_processor = structure_logits_processor + return self.generator.generate( + prompt=prompts, + gen_settings=gen_settings, + max_new_tokens=generation_parameters.max_tokens, + completion_only=True, + encode_special_tokens=exllamav2_params.encode_special_tokens, + stop_conditions=stop_conditions, + add_bos=exllamav2_params.add_bos, + seed=generation_parameters.seed, ) + def stream( + self, + prompts: Union[str, List[str]], + generation_parameters: GenerationParameters, + structure_logits_processor, + sampling_parameters: SamplingParameters, + **exllamav2_params: Unpack[ExllamaV2Params] + ) -> List[Iterator[str]]: + gen_settings = ExLlamaV2Sampler.Settings() + if sampling_parameters.temperature is not None: + gen_settings.temperature = sampling_parameters.temperature + if sampling_parameters.top_p is not None: + gen_settings.top_p = sampling_parameters.top_p + if sampling_parameters.top_k is not None: + gen_settings.top_k = sampling_parameters.top_k + gen_settings.logits_processor = structure_logits_processor + stop_conditions = [self.generator.tokenizer.eos_token_id] + if isinstance(generation_parameters.stop_at, str): + stop_conditions.append(generation_parameters.stop_at) + elif isinstance(generation_parameters.stop_at, list): + for stop_at in generation_parameters.stop_at: + stop_conditions.append(stop_at) + order = {} + if isinstance(prompts, str): + prompts = [prompts] + seed = generation_parameters.seed + batch_size = len(prompts) + for idx, p in enumerate(prompts): + + input_ids = self.generator.tokenizer.encode(p, encode_special_tokens = True, add_bos = False) + + job = ExLlamaV2DynamicJob( + input_ids = input_ids, + max_new_tokens = generation_parameters.max_tokens, + min_new_tokens = exllamav2_params.min_new_tokens, + seed = exllamav2_params.seed, + stop_conditions = stop_conditions, + gen_settings = gen_settings, + token_healing = False, + decode_special_tokens = exllamav2_params.decode_special_tokens, + ) - def __call__(self, input_ids: "torch.LongTensor", *_) -> "torch.FloatTensor": - logits = self.forward(input_ids) - next_token_logits = logits[..., -1, :] + if seed is not None: seed += 1 - return next_token_logits, None + serial = self.generator.enqueue(job) + order[serial] = idx - def update_lora(self, lora_path: Optional[str] = None): - """ - Update and apply the LoRA to the model. + # Collect outputs until all jobs finish - Args: - lora_path (Optional[str]): The path to the LoRA directory. If None, the LoRA will be unloaded. - """ - try: - from exllamav2 import ExLlamaV2Lora - except ImportError: - raise ImportError( - "The `exllamav2` library needs to be installed in order to use `exllamav2` models." - ) - if lora_path is None: - if self.lora is not None: - print(" -- Unloading LoRA...") - self.lora = None - else: - self.lora = ExLlamaV2Lora.from_directory(self.model, lora_path) - print(" -- Loading LoRA...") + completions = [""] * batch_size + + def token_generator() -> Iterator[str]: + while self.generator.num_remaining_jobs(): + results = self.generator.iterate() + for r in results: + idx = order[r["serial"]] + if r["stage"] == "streaming": + all_eos = False + text = r.get("text", "") + completions[idx] += text + if r["eos"]: + completions[idx] = r + yield completions + return + + return token_generator() + + def load_lora(self, adapter_path: str): + loras = [ExLlamaV2Lora.from_directory(self.model, adapter_path)] + print(" -- Loading LoRA...") + self.generator.set_loras(loras) def exl2( model_path: str, - device: str, + draft_model_path: Optional[str] = None, max_seq_len: Optional[int] = None, - scale_pos_emb: Optional[float] = None, - scale_alpha_value: Optional[float] = None, - no_flash_attn: Optional[bool] = None, - num_experts_per_token: Optional[int] = None, cache_8bit: bool = False, cache_q4: bool = False, - tokenizer_kwargs: dict = {}, - gpu_split: Optional[str] = None, - low_mem: Optional[bool] = None, - verbose: Optional[bool] = None, + paged: bool = True, + max_chunk_size: Optional[int] = None, + lora: Optional[ExLlamaV2Lora] = None + ) -> ExLlamaV2Model: """ Load an ExLlamaV2 model. @@ -171,62 +202,54 @@ def exl2( raise ImportError( "The `exllamav2`, `transformers` and `torch` libraries needs to be installed in order to use `exllamav2` models." ) + config = ExLlamaV2Config(model_path) + if max_chunk_size is not None: + config.max_input_len = max_chunk_size + config.max_attention_size = max_chunk_size ** 2 - # Load tokenizer - if not verbose: - print(" -- Loading tokenizer...") - tokenizer_kwargs.setdefault("padding_side", "left") - tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs) - # tokenizer = TransformerTokenizer(model_path, **tokenizer_kwargs) - - # Check fasttensors for config - if os.name != "nt": - use_fasttensors = True - else: - use_fasttensors = False - - # Create config - config = ExLlamaV2Config() - config.model_dir = model_path - config.fasttensors = use_fasttensors - config.prepare() - - # Set config options - if max_seq_len is not None: - config.max_seq_len = max_seq_len - if scale_pos_emb is not None: - config.scale_pos_emb = scale_pos_emb - if scale_alpha_value is not None: - config.scale_alpha_value = scale_alpha_value - if no_flash_attn is not None: - config.no_flash_attn = no_flash_attn - if num_experts_per_token is not None: - config.num_experts_per_token = num_experts_per_token - if low_mem: - config.set_low_mem() - - # Prepare the model from the config + config.arch_compat_overrides() model = ExLlamaV2(config) - - # Create cache + if max_seq_len is None: + max_seq_len = -1 if cache_8bit: - cache = ExLlamaV2Cache_8bit(model, lazy=not model.loaded) + cache = ExLlamaV2Cache_8bit(model, max_seq_len=max_seq_len, lazy=True) elif cache_q4: - cache = ExLlamaV2Cache_Q4(model, lazy=not model.loaded) + cache = ExLlamaV2Cache_Q4(model, max_seq_len=max_seq_len, lazy=True) else: - cache = ExLlamaV2Cache(model, lazy=not model.loaded) - - # Load the model - split = None - if gpu_split and gpu_split != "auto": - split = [float(alloc) for alloc in gpu_split.split(",")] - if not verbose: - print(" -- Loading model...") - model.load(split) - - # Autoload if no GPU split was provided - if not model.loaded: - print(" -- Loading model...") - model.load_autosplit(cache) - - return ExLlamaV2Model(model, tokenizer, device, cache) + cache = ExLlamaV2Cache(model, max_seq_len=max_seq_len, lazy=True) + model.load_autosplit(cache, progress=True) + + print("Loading tokenizer...") + tokenizer = ExLlamaV2Tokenizer(config) + tokenizer.vocabulary = tokenizer.extended_piece_to_id + max_batch_size = 4 if paged else 1 + + draft_model = None + draft_cache = None + if draft_model_path is not None: + draft_config = ExLlamaV2Config(draft_model_path) + draft_model = ExLlamaV2(draft_config) + + if cache_8bit: + draft_cache = ExLlamaV2Cache_8bit(draft_model, max_seq_len=max_seq_len, lazy=True) + elif cache_q4: + draft_cache = ExLlamaV2Cache_Q4(draft_model, max_seq_len=max_seq_len, lazy=True) + else: + draft_cache = ExLlamaV2Cache(draft_model, max_seq_len=max_seq_len, lazy=True) + + + # Initialize the generator with all default parameters + generator = ExLlamaV2DynamicGenerator( + model = model, + cache = cache, + draft_model = draft_model, + draft_cache = draft_cache, + tokenizer = tokenizer, + max_batch_size = max_batch_size, + use_ngram_draft = False, + max_chunk_size = max_chunk_size, + paged = paged, + ) + if lora is not None: + generator.set_loras(lora) + return ExLlamaV2Model(generator) \ No newline at end of file diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py index a86d3c253..6b8a70653 100644 --- a/tests/generate/test_generate.py +++ b/tests/generate/test_generate.py @@ -19,6 +19,13 @@ def model_llamacpp(tmp_path_factory): filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf", ) +@pytest.fixture(scope="session") +def model_exllamav2(tmp_path_factory): + return models.exllamav2( + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + cache_q4=True, + paged=False + ) @pytest.fixture(scope="session") def model_mlxlm(tmp_path_factory): @@ -96,6 +103,7 @@ def model_t5(tmp_path_factory): ALL_MODEL_FIXTURES = ( "model_llamacpp", + "model_exllamav2", "model_mlxlm", "model_mlxlm_phi3", "model_transformers_random", From a68ddd7aefe0fef3b0c47df0739297e4904dc085 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Tue, 6 Aug 2024 00:53:12 -0400 Subject: [PATCH 06/28] Passed local test --- outlines/models/exllamav2.py | 146 ++++++++++++++++++++++---------- tests/generate/test_generate.py | 2 +- 2 files changed, 100 insertions(+), 48 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 369d63ad2..63daf8b8d 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -1,4 +1,5 @@ -import os +import dataclasses + from typing import ( TYPE_CHECKING, Iterator, @@ -9,43 +10,75 @@ ) from typing_extensions import Unpack +from outlines.models.transformers import TransformerTokenizer from outlines.generate.api import GenerationParameters, SamplingParameters - if TYPE_CHECKING: - from exllamav2 import ExLlamaV2, ExLlamaV2Lora, ExLlamaV2Tokenizer - from exllamav2.generator import ExLlamaV2Sampler - from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler, ExLlamaV2DynamicJob + from exllamav2 import ExLlamaV2Lora + from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler class ExllamaV2Params(TypedDict, total=False): - min_new_tokens: int = 0, - seed: Union[int, None] = None, - encode_special_tokens: bool = False, - decode_special_tokens: bool = False, - add_bos: bool = False, + max_tokens: int + stop_conditions: list[int | str] | None = None + seed: int | None = None + gen_settings: "ExLlamaV2Sampler.Settings" class ExLlamaV2Model: """Represents a `exl2` model.""" def __init__( self, - generator: ExLlamaV2DynamicGenerator, + generator: "ExLlamaV2DynamicGenerator", + tokenizer: TransformerTokenizer, + max_seq_len: int ): self.generator = generator - def generate( + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + def prepare_generation_parameters( self, prompts: Union[str, List[str]], generation_parameters: GenerationParameters, - structure_logits_processor, sampling_parameters: SamplingParameters, - **exllamav2_params: Unpack[ExllamaV2Params] - ) -> List[str]: + structure_logits_processor, + **exllamav2_params: Unpack[ExllamaV2Params], + ): + """Prepare the generation parameters. + + `exllamav2` uses different default values + + """ + try: + from exllamav2.generator import ExLlamaV2Sampler + except ImportError: + raise ImportError( + "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." + ) + if isinstance(prompts, str): + prompts = [prompts] + max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters) + + + if max_tokens is None: + max_tokens = [] + for prompt in prompts: + ids = self.generator.tokenizer.encode(prompt, encode_special_tokens = False) + prompt_tokens = ids.shape[-1] + max_tokens.append(self.max_seq_len - prompt_tokens) + exllamav2_params["max_new_tokens"] = max_tokens + else: + exllamav2_params["max_new_tokens"] = [max_tokens for _ in range(len(prompts))] + stop_conditions = [self.generator.tokenizer.eos_token_id] if isinstance(generation_parameters.stop_at, str): stop_conditions.append(generation_parameters.stop_at) elif isinstance(generation_parameters.stop_at, list): for stop_at in generation_parameters.stop_at: stop_conditions.append(stop_at) + exllamav2_params["stop_conditions"] = stop_conditions + exllamav2_params["seed"] = seed + + gen_settings = ExLlamaV2Sampler.Settings() if sampling_parameters.temperature is not None: gen_settings.temperature = sampling_parameters.temperature @@ -54,15 +87,28 @@ def generate( if sampling_parameters.top_k is not None: gen_settings.top_k = sampling_parameters.top_k gen_settings.logits_processor = structure_logits_processor + exllamav2_params["gen_settings"] = gen_settings + + + return exllamav2_params + def generate( + self, + prompts: Union[str, List[str]], + generation_parameters: GenerationParameters, + structure_logits_processor, + sampling_parameters: SamplingParameters, + **exllamav2_params: Unpack[ExllamaV2Params] + ) -> List[str]: + exllamav2_params = self.prepare_generation_parameters(prompts, generation_parameters, sampling_parameters, structure_logits_processor) return self.generator.generate( prompt=prompts, - gen_settings=gen_settings, - max_new_tokens=generation_parameters.max_tokens, + gen_settings=exllamav2_params["gen_settings"], + max_new_tokens=min(exllamav2_params["max_new_tokens"]), completion_only=True, - encode_special_tokens=exllamav2_params.encode_special_tokens, - stop_conditions=stop_conditions, - add_bos=exllamav2_params.add_bos, - seed=generation_parameters.seed, + encode_special_tokens=False, + stop_conditions=exllamav2_params["stop_conditions"], + add_bos=False, + seed=exllamav2_params["seed"], ) def stream( self, @@ -72,38 +118,31 @@ def stream( sampling_parameters: SamplingParameters, **exllamav2_params: Unpack[ExllamaV2Params] ) -> List[Iterator[str]]: - gen_settings = ExLlamaV2Sampler.Settings() - if sampling_parameters.temperature is not None: - gen_settings.temperature = sampling_parameters.temperature - if sampling_parameters.top_p is not None: - gen_settings.top_p = sampling_parameters.top_p - if sampling_parameters.top_k is not None: - gen_settings.top_k = sampling_parameters.top_k - gen_settings.logits_processor = structure_logits_processor - stop_conditions = [self.generator.tokenizer.eos_token_id] - if isinstance(generation_parameters.stop_at, str): - stop_conditions.append(generation_parameters.stop_at) - elif isinstance(generation_parameters.stop_at, list): - for stop_at in generation_parameters.stop_at: - stop_conditions.append(stop_at) + try: + from exllamav2.generator import ExLlamaV2DynamicJob + except ImportError: + raise ImportError( + "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." + ) + exllamav2_params = self.prepare_generation_parameters(prompts, generation_parameters, sampling_parameters, structure_logits_processor) order = {} if isinstance(prompts, str): prompts = [prompts] - seed = generation_parameters.seed batch_size = len(prompts) + seed = exllamav2_params["seed"] for idx, p in enumerate(prompts): - input_ids = self.generator.tokenizer.encode(p, encode_special_tokens = True, add_bos = False) + input_ids = self.generator.tokenizer.encode(p, encode_special_tokens = False, add_bos = False) job = ExLlamaV2DynamicJob( input_ids = input_ids, - max_new_tokens = generation_parameters.max_tokens, - min_new_tokens = exllamav2_params.min_new_tokens, - seed = exllamav2_params.seed, - stop_conditions = stop_conditions, - gen_settings = gen_settings, + max_new_tokens = exllamav2_params["max_new_tokens"][idx], + min_new_tokens = 0, + seed = seed, + stop_conditions = exllamav2_params["stop_conditions"], + gen_settings = exllamav2_params["gen_settings"], token_healing = False, - decode_special_tokens = exllamav2_params.decode_special_tokens, + decode_special_tokens = False, ) if seed is not None: seed += 1 @@ -121,7 +160,6 @@ def token_generator() -> Iterator[str]: for r in results: idx = order[r["serial"]] if r["stage"] == "streaming": - all_eos = False text = r.get("text", "") completions[idx] += text if r["eos"]: @@ -132,6 +170,12 @@ def token_generator() -> Iterator[str]: return token_generator() def load_lora(self, adapter_path: str): + try: + from exllamav2 import ExLlamaV2Lora + except ImportError: + raise ImportError( + "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." + ) loras = [ExLlamaV2Lora.from_directory(self.model, adapter_path)] print(" -- Loading LoRA...") self.generator.set_loras(loras) @@ -145,7 +189,7 @@ def exl2( cache_q4: bool = False, paged: bool = True, max_chunk_size: Optional[int] = None, - lora: Optional[ExLlamaV2Lora] = None + lora: Optional["ExLlamaV2Lora"] = None ) -> ExLlamaV2Model: """ @@ -196,11 +240,15 @@ def exl2( ExLlamaV2Cache_8bit, ExLlamaV2Cache_Q4, ExLlamaV2Config, + ExLlamaV2Tokenizer ) + from exllamav2.generator import ExLlamaV2DynamicGenerator from transformers import AutoTokenizer + + except ImportError: raise ImportError( - "The `exllamav2`, `transformers` and `torch` libraries needs to be installed in order to use `exllamav2` models." + "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." ) config = ExLlamaV2Config(model_path) if max_chunk_size is not None: @@ -252,4 +300,8 @@ def exl2( ) if lora is not None: generator.set_loras(lora) - return ExLlamaV2Model(generator) \ No newline at end of file + hf_tokenizer_kwargs = {} + hf_tokenizer_kwargs.setdefault("padding_side", "left") + hf_tokenizer = AutoTokenizer.from_pretrained(model_path, **hf_tokenizer_kwargs) + max_seq_len = cache.max_seq_len + return ExLlamaV2Model(generator, TransformerTokenizer(hf_tokenizer), max_seq_len) \ No newline at end of file diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py index 6b8a70653..377065fae 100644 --- a/tests/generate/test_generate.py +++ b/tests/generate/test_generate.py @@ -21,7 +21,7 @@ def model_llamacpp(tmp_path_factory): @pytest.fixture(scope="session") def model_exllamav2(tmp_path_factory): - return models.exllamav2( + return models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", cache_q4=True, paged=False From 197718f2c22ecb78242759e68eb221dac890347d Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Tue, 13 Aug 2024 15:40:01 -0400 Subject: [PATCH 07/28] Fixed tests+precommit --- outlines/models/exllamav2.py | 159 ++++++++++++++++++++------------ outlines/types/airports.py | 9 +- tests/generate/test_generate.py | 6 +- 3 files changed, 110 insertions(+), 64 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 63daf8b8d..6b2a2286d 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -1,18 +1,11 @@ import dataclasses - -from typing import ( - TYPE_CHECKING, - Iterator, - List, - Optional, - TypedDict, - Union, -) +from typing import TYPE_CHECKING, Iterator, List, Optional, TypedDict, Union from typing_extensions import Unpack -from outlines.models.transformers import TransformerTokenizer from outlines.generate.api import GenerationParameters, SamplingParameters +from outlines.models.transformers import TransformerTokenizer + if TYPE_CHECKING: from exllamav2 import ExLlamaV2Lora from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler @@ -24,17 +17,20 @@ class ExllamaV2Params(TypedDict, total=False): seed: int | None = None gen_settings: "ExLlamaV2Sampler.Settings" + class ExLlamaV2Model: """Represents a `exl2` model.""" + def __init__( self, generator: "ExLlamaV2DynamicGenerator", tokenizer: TransformerTokenizer, - max_seq_len: int + max_seq_len: int, ): self.generator = generator self.tokenizer = tokenizer self.max_seq_len = max_seq_len + def prepare_generation_parameters( self, prompts: Union[str, List[str]], @@ -42,7 +38,7 @@ def prepare_generation_parameters( sampling_parameters: SamplingParameters, structure_logits_processor, **exllamav2_params: Unpack[ExllamaV2Params], - ): + ) -> tuple[ExllamaV2Params, Union[str, List[str]]]: """Prepare the generation parameters. `exllamav2` uses different default values @@ -58,16 +54,19 @@ def prepare_generation_parameters( prompts = [prompts] max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters) - if max_tokens is None: max_tokens = [] for prompt in prompts: - ids = self.generator.tokenizer.encode(prompt, encode_special_tokens = False) + ids = self.generator.tokenizer.encode( + prompt, encode_special_tokens=False + ) prompt_tokens = ids.shape[-1] max_tokens.append(self.max_seq_len - prompt_tokens) exllamav2_params["max_new_tokens"] = max_tokens else: - exllamav2_params["max_new_tokens"] = [max_tokens for _ in range(len(prompts))] + exllamav2_params["max_new_tokens"] = [ + max_tokens for _ in range(len(prompts)) + ] stop_conditions = [self.generator.tokenizer.eos_token_id] if isinstance(generation_parameters.stop_at, str): @@ -78,7 +77,6 @@ def prepare_generation_parameters( exllamav2_params["stop_conditions"] = stop_conditions exllamav2_params["seed"] = seed - gen_settings = ExLlamaV2Sampler.Settings() if sampling_parameters.temperature is not None: gen_settings.temperature = sampling_parameters.temperature @@ -88,19 +86,50 @@ def prepare_generation_parameters( gen_settings.top_k = sampling_parameters.top_k gen_settings.logits_processor = structure_logits_processor exllamav2_params["gen_settings"] = gen_settings + if sampling_parameters.num_samples > 1: + prompts = prompts * sampling_parameters.num_samples + + if len(prompts) == 1: + prompts = prompts[0] + return exllamav2_params, prompts + + def reformat_output( + self, output: Union[str, List[str]], sampling_parameters: SamplingParameters + ): + if isinstance(output, str): + return output + if len(output) == 1: + return output[0] + if sampling_parameters.num_samples > 1: + if len(output) == sampling_parameters.num_samples: + return output + assert len(output) % sampling_parameters.num_samples == 0 + num_items_per_sample = len(output) // sampling_parameters.num_samples + new_output = [] + for i in range(sampling_parameters.num_samples): + curr_sample = [] + for j in range(num_items_per_sample): + curr_sample.append(output[i * num_items_per_sample + j]) + new_output.append(curr_sample) + return new_output + return output - return exllamav2_params def generate( self, prompts: Union[str, List[str]], generation_parameters: GenerationParameters, structure_logits_processor, sampling_parameters: SamplingParameters, - **exllamav2_params: Unpack[ExllamaV2Params] - ) -> List[str]: - exllamav2_params = self.prepare_generation_parameters(prompts, generation_parameters, sampling_parameters, structure_logits_processor) - return self.generator.generate( + **exllamav2_params: Unpack[ExllamaV2Params], + ) -> Union[str, List[str]]: + exllamav2_params, prompts = self.prepare_generation_parameters( + prompts, + generation_parameters, + sampling_parameters, + structure_logits_processor, + ) + output = self.generator.generate( prompt=prompts, gen_settings=exllamav2_params["gen_settings"], max_new_tokens=min(exllamav2_params["max_new_tokens"]), @@ -110,49 +139,60 @@ def generate( add_bos=False, seed=exllamav2_params["seed"], ) + + return self.reformat_output(output, sampling_parameters) + def stream( self, prompts: Union[str, List[str]], generation_parameters: GenerationParameters, structure_logits_processor, sampling_parameters: SamplingParameters, - **exllamav2_params: Unpack[ExllamaV2Params] - ) -> List[Iterator[str]]: + **exllamav2_params: Unpack[ExllamaV2Params], + ) -> Iterator[Union[str, List[str]]]: try: from exllamav2.generator import ExLlamaV2DynamicJob except ImportError: raise ImportError( "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." ) - exllamav2_params = self.prepare_generation_parameters(prompts, generation_parameters, sampling_parameters, structure_logits_processor) + exllamav2_params, prompts = self.prepare_generation_parameters( + prompts, + generation_parameters, + sampling_parameters, + structure_logits_processor, + ) + order = {} if isinstance(prompts, str): prompts = [prompts] batch_size = len(prompts) seed = exllamav2_params["seed"] for idx, p in enumerate(prompts): - - input_ids = self.generator.tokenizer.encode(p, encode_special_tokens = False, add_bos = False) + input_ids = self.generator.tokenizer.encode( + p, encode_special_tokens=False, add_bos=False + ) job = ExLlamaV2DynamicJob( - input_ids = input_ids, - max_new_tokens = exllamav2_params["max_new_tokens"][idx], - min_new_tokens = 0, - seed = seed, - stop_conditions = exllamav2_params["stop_conditions"], - gen_settings = exllamav2_params["gen_settings"], - token_healing = False, - decode_special_tokens = False, + input_ids=input_ids, + max_new_tokens=exllamav2_params["max_new_tokens"][idx], + min_new_tokens=0, + seed=seed, + stop_conditions=exllamav2_params["stop_conditions"], + gen_settings=exllamav2_params["gen_settings"], + token_healing=False, + decode_special_tokens=False, ) - if seed is not None: seed += 1 + if seed is not None: + seed += 1 serial = self.generator.enqueue(job) order[serial] = idx # Collect outputs until all jobs finish - completions = [""] * batch_size + next_text = [""] * batch_size def token_generator() -> Iterator[str]: while self.generator.num_remaining_jobs(): @@ -161,10 +201,10 @@ def token_generator() -> Iterator[str]: idx = order[r["serial"]] if r["stage"] == "streaming": text = r.get("text", "") - completions[idx] += text + next_text[idx] = text if r["eos"]: - completions[idx] = r - yield completions + next_text[idx] = "" + yield self.reformat_output(next_text, sampling_parameters) return return token_generator() @@ -189,8 +229,7 @@ def exl2( cache_q4: bool = False, paged: bool = True, max_chunk_size: Optional[int] = None, - lora: Optional["ExLlamaV2Lora"] = None - + lora: Optional["ExLlamaV2Lora"] = None, ) -> ExLlamaV2Model: """ Load an ExLlamaV2 model. @@ -240,12 +279,11 @@ def exl2( ExLlamaV2Cache_8bit, ExLlamaV2Cache_Q4, ExLlamaV2Config, - ExLlamaV2Tokenizer + ExLlamaV2Tokenizer, ) from exllamav2.generator import ExLlamaV2DynamicGenerator from transformers import AutoTokenizer - except ImportError: raise ImportError( "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." @@ -253,7 +291,7 @@ def exl2( config = ExLlamaV2Config(model_path) if max_chunk_size is not None: config.max_input_len = max_chunk_size - config.max_attention_size = max_chunk_size ** 2 + config.max_attention_size = max_chunk_size**2 config.arch_compat_overrides() model = ExLlamaV2(config) @@ -279,24 +317,29 @@ def exl2( draft_model = ExLlamaV2(draft_config) if cache_8bit: - draft_cache = ExLlamaV2Cache_8bit(draft_model, max_seq_len=max_seq_len, lazy=True) + draft_cache = ExLlamaV2Cache_8bit( + draft_model, max_seq_len=max_seq_len, lazy=True + ) elif cache_q4: - draft_cache = ExLlamaV2Cache_Q4(draft_model, max_seq_len=max_seq_len, lazy=True) + draft_cache = ExLlamaV2Cache_Q4( + draft_model, max_seq_len=max_seq_len, lazy=True + ) else: - draft_cache = ExLlamaV2Cache(draft_model, max_seq_len=max_seq_len, lazy=True) - + draft_cache = ExLlamaV2Cache( + draft_model, max_seq_len=max_seq_len, lazy=True + ) # Initialize the generator with all default parameters generator = ExLlamaV2DynamicGenerator( - model = model, - cache = cache, - draft_model = draft_model, - draft_cache = draft_cache, - tokenizer = tokenizer, - max_batch_size = max_batch_size, - use_ngram_draft = False, - max_chunk_size = max_chunk_size, - paged = paged, + model=model, + cache=cache, + draft_model=draft_model, + draft_cache=draft_cache, + tokenizer=tokenizer, + max_batch_size=max_batch_size, + use_ngram_draft=False, + max_chunk_size=max_chunk_size, + paged=paged, ) if lora is not None: generator.set_loras(lora) @@ -304,4 +347,4 @@ def exl2( hf_tokenizer_kwargs.setdefault("padding_side", "left") hf_tokenizer = AutoTokenizer.from_pretrained(model_path, **hf_tokenizer_kwargs) max_seq_len = cache.max_seq_len - return ExLlamaV2Model(generator, TransformerTokenizer(hf_tokenizer), max_seq_len) \ No newline at end of file + return ExLlamaV2Model(generator, TransformerTokenizer(hf_tokenizer), max_seq_len) diff --git a/outlines/types/airports.py b/outlines/types/airports.py index b4ed6784d..dab134889 100644 --- a/outlines/types/airports.py +++ b/outlines/types/airports.py @@ -1,10 +1,11 @@ """Generate valid airport codes.""" from enum import Enum -from pyairports.airports import AIRPORT_LIST +# from pyairports.airports import AIRPORT_LIST -AIRPORT_IATA_LIST = list( - {(airport[3], airport[3]) for airport in AIRPORT_LIST if airport[3] != ""} -) +# AIRPORT_IATA_LIST = list( +# {(airport[3], airport[3]) for airport in AIRPORT_LIST if airport[3] != ""} +# ) +AIRPORT_IATA_LIST = [] IATA = Enum("Airport", AIRPORT_IATA_LIST) # type:ignore diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py index 377065fae..82333794a 100644 --- a/tests/generate/test_generate.py +++ b/tests/generate/test_generate.py @@ -19,14 +19,16 @@ def model_llamacpp(tmp_path_factory): filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf", ) + @pytest.fixture(scope="session") def model_exllamav2(tmp_path_factory): return models.exl2( - model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4.6-exl2", cache_q4=True, - paged=False + paged=False, ) + @pytest.fixture(scope="session") def model_mlxlm(tmp_path_factory): return models.mlxlm("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit") From df4bd6a8d06a561810ee6dbe38ecbe27c6f1d37f Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Tue, 13 Aug 2024 15:57:39 -0400 Subject: [PATCH 08/28] Revert change for pyairports --- outlines/models/exllamav2.py | 3 ++- outlines/types/airports.py | 9 ++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 6b2a2286d..28dc2b359 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -16,6 +16,7 @@ class ExllamaV2Params(TypedDict, total=False): stop_conditions: list[int | str] | None = None seed: int | None = None gen_settings: "ExLlamaV2Sampler.Settings" + max_new_tokens: list[int] class ExLlamaV2Model: @@ -216,7 +217,7 @@ def load_lora(self, adapter_path: str): raise ImportError( "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." ) - loras = [ExLlamaV2Lora.from_directory(self.model, adapter_path)] + loras = [ExLlamaV2Lora.from_directory(self.generator.model, adapter_path)] print(" -- Loading LoRA...") self.generator.set_loras(loras) diff --git a/outlines/types/airports.py b/outlines/types/airports.py index dab134889..b4ed6784d 100644 --- a/outlines/types/airports.py +++ b/outlines/types/airports.py @@ -1,11 +1,10 @@ """Generate valid airport codes.""" from enum import Enum -# from pyairports.airports import AIRPORT_LIST +from pyairports.airports import AIRPORT_LIST -# AIRPORT_IATA_LIST = list( -# {(airport[3], airport[3]) for airport in AIRPORT_LIST if airport[3] != ""} -# ) -AIRPORT_IATA_LIST = [] +AIRPORT_IATA_LIST = list( + {(airport[3], airport[3]) for airport in AIRPORT_LIST if airport[3] != ""} +) IATA = Enum("Airport", AIRPORT_IATA_LIST) # type:ignore From 4ffdf34fea6b56a136bab8edefd5beb493b7f6c6 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Tue, 13 Aug 2024 16:06:53 -0400 Subject: [PATCH 09/28] Fixed precommit --- outlines/models/exllamav2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 28dc2b359..e1192b854 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -1,5 +1,5 @@ import dataclasses -from typing import TYPE_CHECKING, Iterator, List, Optional, TypedDict, Union +from typing import TYPE_CHECKING, Any, Iterator, List, Optional, TypedDict, Union from typing_extensions import Unpack @@ -13,8 +13,8 @@ class ExllamaV2Params(TypedDict, total=False): max_tokens: int - stop_conditions: list[int | str] | None = None - seed: int | None = None + stop_conditions: Optional[list[int | str]] + seed: Optional[int] gen_settings: "ExLlamaV2Sampler.Settings" max_new_tokens: list[int] @@ -344,7 +344,7 @@ def exl2( ) if lora is not None: generator.set_loras(lora) - hf_tokenizer_kwargs = {} + hf_tokenizer_kwargs: dict[str, Any] = {} hf_tokenizer_kwargs.setdefault("padding_side", "left") hf_tokenizer = AutoTokenizer.from_pretrained(model_path, **hf_tokenizer_kwargs) max_seq_len = cache.max_seq_len From 39ecf7dbd715b32303b373e68f778cf93ae554ba Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Tue, 13 Aug 2024 16:34:47 -0400 Subject: [PATCH 10/28] Wrap up --- outlines/generate/fsm.py | 3 +-- outlines/generate/regex.py | 4 +--- outlines/generate/text.py | 4 +--- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/outlines/generate/fsm.py b/outlines/generate/fsm.py index 4fcc625e5..c27c8bd2e 100644 --- a/outlines/generate/fsm.py +++ b/outlines/generate/fsm.py @@ -4,11 +4,10 @@ from outlines.fsm.guide import RegexGuide from outlines.generate.api import ( - SequenceGenerator, SequenceGeneratorAdapter, VisionSequenceGeneratorAdapter, ) -from outlines.models import ExLlamaV2Model, TransformersVision +from outlines.models import TransformersVision from outlines.samplers import Sampler, multinomial diff --git a/outlines/generate/regex.py b/outlines/generate/regex.py index a2270c7d4..673880e49 100644 --- a/outlines/generate/regex.py +++ b/outlines/generate/regex.py @@ -1,12 +1,10 @@ from functools import singledispatch -from outlines.fsm.guide import RegexGuide from outlines.generate.api import ( - SequenceGenerator, SequenceGeneratorAdapter, VisionSequenceGeneratorAdapter, ) -from outlines.models import ExLlamaV2Model, OpenAI, TransformersVision +from outlines.models import OpenAI, TransformersVision from outlines.samplers import Sampler, multinomial diff --git a/outlines/generate/text.py b/outlines/generate/text.py index fad9ea7a3..32530d0c4 100644 --- a/outlines/generate/text.py +++ b/outlines/generate/text.py @@ -1,12 +1,10 @@ from functools import singledispatch -from outlines.fsm.guide import StopAtEOSGuide from outlines.generate.api import ( - SequenceGenerator, SequenceGeneratorAdapter, VisionSequenceGeneratorAdapter, ) -from outlines.models import ExLlamaV2Model, OpenAI, TransformersVision +from outlines.models import OpenAI, TransformersVision from outlines.samplers import Sampler, multinomial From ab731bdbd7249e9a405a7e5d7587ddd11f53e115 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Tue, 13 Aug 2024 16:38:19 -0400 Subject: [PATCH 11/28] Remove | for union --- outlines/models/exllamav2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index e1192b854..23fbfd385 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -13,7 +13,7 @@ class ExllamaV2Params(TypedDict, total=False): max_tokens: int - stop_conditions: Optional[list[int | str]] + stop_conditions: Optional[list[Union[int, str]]] seed: Optional[int] gen_settings: "ExLlamaV2Sampler.Settings" max_new_tokens: list[int] From 4cda25455fa66a06788961810deac3e9b695daba Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Tue, 13 Aug 2024 16:50:49 -0400 Subject: [PATCH 12/28] Attempt changing to List --- outlines/models/exllamav2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 23fbfd385..102b13114 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -13,10 +13,10 @@ class ExllamaV2Params(TypedDict, total=False): max_tokens: int - stop_conditions: Optional[list[Union[int, str]]] + stop_conditions: Optional[List[Union[int, str]]] seed: Optional[int] gen_settings: "ExLlamaV2Sampler.Settings" - max_new_tokens: list[int] + max_new_tokens: List[int] class ExLlamaV2Model: From f402f336b97c825f6604238e10300e7832213818 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Tue, 13 Aug 2024 17:27:49 -0400 Subject: [PATCH 13/28] Fixed for 3.8 --- outlines/models/exllamav2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 102b13114..91201628d 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -1,5 +1,5 @@ import dataclasses -from typing import TYPE_CHECKING, Any, Iterator, List, Optional, TypedDict, Union +from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple, TypedDict, Union from typing_extensions import Unpack @@ -39,7 +39,7 @@ def prepare_generation_parameters( sampling_parameters: SamplingParameters, structure_logits_processor, **exllamav2_params: Unpack[ExllamaV2Params], - ) -> tuple[ExllamaV2Params, Union[str, List[str]]]: + ) -> Tuple[ExllamaV2Params, Union[str, List[str]]]: """Prepare the generation parameters. `exllamav2` uses different default values From e014a63b03ccccc48f98991608be3f4bf0afd802 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 19 Aug 2024 14:58:40 -0400 Subject: [PATCH 14/28] Adding exllamav2 to optional dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index f94b3c84d..457494735 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ test = [ "accelerate", "beartype<0.16.0", "responses", + "exllamav2", "llama-cpp-python", "mlx-lm; platform_machine == 'arm64' and sys_platform == 'darwin'", "huggingface_hub", From 06e9b64678252b863a60b9bfd7037bfc51f21bc7 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 19 Aug 2024 15:05:21 -0400 Subject: [PATCH 15/28] Fixed model --- tests/generate/test_generate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generate/test_generate.py b/tests/generate/test_generate.py index 82333794a..d726325e6 100644 --- a/tests/generate/test_generate.py +++ b/tests/generate/test_generate.py @@ -23,7 +23,7 @@ def model_llamacpp(tmp_path_factory): @pytest.fixture(scope="session") def model_exllamav2(tmp_path_factory): return models.exl2( - model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4.6-exl2", + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", cache_q4=True, paged=False, ) From f43e4d2226380ea2802c12e3ed62ca35664c47a7 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 19 Aug 2024 16:25:32 -0400 Subject: [PATCH 16/28] Changed to fork --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 457494735..4a19c3d5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ test = [ "accelerate", "beartype<0.16.0", "responses", - "exllamav2", + "exllamav2==git+https://github.com/lapp0/exllamav2.git@sampler-logits-processor", "llama-cpp-python", "mlx-lm; platform_machine == 'arm64' and sys_platform == 'darwin'", "huggingface_hub", From 7b29e8c47d4dd2ca260fbc2c71b9deb3b0dcaa1c Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 19 Aug 2024 16:28:05 -0400 Subject: [PATCH 17/28] Fix format --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4a19c3d5f..a1c59027d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ test = [ "accelerate", "beartype<0.16.0", "responses", - "exllamav2==git+https://github.com/lapp0/exllamav2.git@sampler-logits-processor", + "exllamav2@git+https://github.com/lapp0/exllamav2.git#egg=sampler-logits-processor", "llama-cpp-python", "mlx-lm; platform_machine == 'arm64' and sys_platform == 'darwin'", "huggingface_hub", From 8d1fca6ce426d6615adedeec0671014f7beb8a6a Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 19 Aug 2024 16:30:02 -0400 Subject: [PATCH 18/28] Changed order --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a1c59027d..efcfd5882 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,6 @@ test = [ "accelerate", "beartype<0.16.0", "responses", - "exllamav2@git+https://github.com/lapp0/exllamav2.git#egg=sampler-logits-processor", "llama-cpp-python", "mlx-lm; platform_machine == 'arm64' and sys_platform == 'darwin'", "huggingface_hub", @@ -64,6 +63,7 @@ test = [ "torch", "transformers", "pillow", + "exllamav2@git+https://github.com/lapp0/exllamav2.git#egg=sampler-logits-processor", ] serve = [ "vllm>=0.3.0", From 09e48437c4d0023e5354e4459639d55888bf02af Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Tue, 20 Aug 2024 14:19:23 -0400 Subject: [PATCH 19/28] Skip exllamav2 tests --- pyproject.toml | 2 +- tests/generate/conftest.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index efcfd5882..52588d929 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,7 @@ test = [ "torch", "transformers", "pillow", - "exllamav2@git+https://github.com/lapp0/exllamav2.git#egg=sampler-logits-processor", + "exllamav2", ] serve = [ "vllm>=0.3.0", diff --git a/tests/generate/conftest.py b/tests/generate/conftest.py index ed8830119..abd9c72a4 100644 --- a/tests/generate/conftest.py +++ b/tests/generate/conftest.py @@ -27,9 +27,11 @@ def pytest_collection_modifyitems(config, items): for item in items: if "model_fixture" in item.fixturenames: model_param = item.callspec.params.get("model_fixture", None) - if model_param.startswith( - "model_transformers_vision" - ) or model_param.startswith("model_vllm"): + if ( + model_param.startswith("model_transformers_vision") + or model_param.startswith("model_vllm") + or model_param.startswith("model_exllamav2") + ): item.add_marker(skip_marker) if not is_metal_available(): From 785d7de5ff1b59ca179b12fc110b9569d2d95104 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Thu, 29 Aug 2024 21:48:05 -0400 Subject: [PATCH 20/28] Attempt fixing coverage --- outlines/models/exllamav2.py | 41 +-- pyproject.toml | 1 + tests/generate/test_integration_exllamav2.py | 282 +++++++++++++++++++ 3 files changed, 294 insertions(+), 30 deletions(-) create mode 100644 tests/generate/test_integration_exllamav2.py diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 91201628d..6cc9c0daf 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -45,12 +45,8 @@ def prepare_generation_parameters( `exllamav2` uses different default values """ - try: - from exllamav2.generator import ExLlamaV2Sampler - except ImportError: - raise ImportError( - "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." - ) + from exllamav2.generator import ExLlamaV2Sampler + if isinstance(prompts, str): prompts = [prompts] max_tokens, stop_at, seed = dataclasses.astuple(generation_parameters) @@ -89,6 +85,9 @@ def prepare_generation_parameters( exllamav2_params["gen_settings"] = gen_settings if sampling_parameters.num_samples > 1: prompts = prompts * sampling_parameters.num_samples + exllamav2_params["max_new_tokens"] = ( + exllamav2_params["max_new_tokens"] * sampling_parameters.num_samples + ) if len(prompts) == 1: prompts = prompts[0] @@ -151,12 +150,8 @@ def stream( sampling_parameters: SamplingParameters, **exllamav2_params: Unpack[ExllamaV2Params], ) -> Iterator[Union[str, List[str]]]: - try: - from exllamav2.generator import ExLlamaV2DynamicJob - except ImportError: - raise ImportError( - "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." - ) + from exllamav2.generator import ExLlamaV2DynamicJob + exllamav2_params, prompts = self.prepare_generation_parameters( prompts, generation_parameters, @@ -211,12 +206,8 @@ def token_generator() -> Iterator[str]: return token_generator() def load_lora(self, adapter_path: str): - try: - from exllamav2 import ExLlamaV2Lora - except ImportError: - raise ImportError( - "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." - ) + from exllamav2 import ExLlamaV2Lora + loras = [ExLlamaV2Lora.from_directory(self.generator.model, adapter_path)] print(" -- Loading LoRA...") self.generator.set_loras(loras) @@ -226,7 +217,6 @@ def exl2( model_path: str, draft_model_path: Optional[str] = None, max_seq_len: Optional[int] = None, - cache_8bit: bool = False, cache_q4: bool = False, paged: bool = True, max_chunk_size: Optional[int] = None, @@ -251,8 +241,6 @@ def exl2( Disable flash attention. Defaults to None. num_experts_per_token (Optional[int], optional) Number of experts per token. Defaults to None. - cache_8bit (bool, optional) - Use 8-bit cache. Defaults to False. cache_q4 (bool, optional) Use Q4 cache. Defaults to False. tokenizer_kwargs (dict, optional) @@ -277,7 +265,6 @@ def exl2( from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, - ExLlamaV2Cache_8bit, ExLlamaV2Cache_Q4, ExLlamaV2Config, ExLlamaV2Tokenizer, @@ -298,9 +285,7 @@ def exl2( model = ExLlamaV2(config) if max_seq_len is None: max_seq_len = -1 - if cache_8bit: - cache = ExLlamaV2Cache_8bit(model, max_seq_len=max_seq_len, lazy=True) - elif cache_q4: + if cache_q4: cache = ExLlamaV2Cache_Q4(model, max_seq_len=max_seq_len, lazy=True) else: cache = ExLlamaV2Cache(model, max_seq_len=max_seq_len, lazy=True) @@ -317,11 +302,7 @@ def exl2( draft_config = ExLlamaV2Config(draft_model_path) draft_model = ExLlamaV2(draft_config) - if cache_8bit: - draft_cache = ExLlamaV2Cache_8bit( - draft_model, max_seq_len=max_seq_len, lazy=True - ) - elif cache_q4: + if cache_q4: draft_cache = ExLlamaV2Cache_Q4( draft_model, max_seq_len=max_seq_len, lazy=True ) diff --git a/pyproject.toml b/pyproject.toml index 52588d929..126dab2ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ test = [ "transformers", "pillow", "exllamav2", + "flash-attn>=2.5.7", ] serve = [ "vllm>=0.3.0", diff --git a/tests/generate/test_integration_exllamav2.py b/tests/generate/test_integration_exllamav2.py new file mode 100644 index 000000000..1ee77da3a --- /dev/null +++ b/tests/generate/test_integration_exllamav2.py @@ -0,0 +1,282 @@ +import importlib +from unittest.mock import patch + +import pytest + +import outlines.models as models +from outlines.generate.api import GenerationParameters, SamplingParameters +from outlines.models.exllamav2 import ExLlamaV2Model +from outlines.models.transformers import TransformerTokenizer + + +@pytest.fixture(scope="session") +def model(tmp_path_factory): + return models.exl2( + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + cache_q4=True, + paged=False, + ) + + +def test_exl2_import_error(): + with patch.dict("sys.modules", {"exllamav2": None}): + with pytest.raises(ImportError): + models.exl2( + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + cache_q4=True, + paged=False, + ) + + +def test_model_attributes(model): + assert hasattr(model, "generator") + assert hasattr(model, "tokenizer") + assert isinstance(model.tokenizer, TransformerTokenizer) + assert hasattr(model, "max_seq_len") + assert isinstance(model.max_seq_len, int) + + +def test_model_generate_prompt_types(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=10, stop_at=None, seed=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 1, + 0.9, + 50, + 1.0, + ) + output = model.generate( + prompt, generation_params, structure_logits_processor, sampling_params + ) + assert isinstance(output, str) + prompt = ["test"] + output = model.generate( + prompt, generation_params, structure_logits_processor, sampling_params + ) + assert isinstance(output, str) + + +def test_model_generate_no_max_tokens(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=None, stop_at=None, seed=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 1, + 0.9, + 50, + 1.0, + ) + output = model.generate( + prompt, generation_params, structure_logits_processor, sampling_params + ) + assert isinstance(output, str) + + +def test_model_generate_test_stop_at(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 1, + 0.9, + 50, + 1.0, + ) + output = model.generate( + prompt, generation_params, structure_logits_processor, sampling_params + ) + assert isinstance(output, str) + generation_params = GenerationParameters(max_tokens=10, stop_at=["stop"], seed=None) + output = model.generate( + prompt, generation_params, structure_logits_processor, sampling_params + ) + assert isinstance(output, str) + + +def test_model_generate_multisampling(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 2, + ) + output = model.generate( + prompt, generation_params, structure_logits_processor, sampling_params + ) + assert isinstance(output, list) + assert isinstance(output[0], str) + + +def test_model_prepare_generation_parameters(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 2, + ) + exllamav2_params, prompts = model.prepare_generation_parameters( + prompt, generation_params, sampling_params, structure_logits_processor + ) + assert isinstance(exllamav2_params, dict) + assert isinstance(prompts, list) + + +def test_model_stream_prompt_types(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=10, stop_at=None, seed=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 1, + 0.9, + 50, + 1.0, + ) + generator = model.stream( + prompt, generation_params, structure_logits_processor, sampling_params + ) + for token in generator: + assert isinstance(token, str) + prompt = ["test"] + generator = model.stream( + prompt, generation_params, structure_logits_processor, sampling_params + ) + for token in generator: + assert isinstance(token, str) + + +def test_model_stream_no_max_tokens(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=None, stop_at=None, seed=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 1, + 0.9, + 50, + 1.0, + ) + generator = model.stream( + prompt, generation_params, structure_logits_processor, sampling_params + ) + for token in generator: + assert isinstance(token, str) + + +def test_model_stream_test_stop_at(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 1, + 0.9, + 50, + 1.0, + ) + generator = model.stream( + prompt, generation_params, structure_logits_processor, sampling_params + ) + for token in generator: + assert isinstance(token, str) + generation_params = GenerationParameters(max_tokens=10, stop_at=["stop"], seed=None) + generator = model.stream( + prompt, generation_params, structure_logits_processor, sampling_params + ) + for token in generator: + assert isinstance(token, str) + + +def test_model_stream_multisampling(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 2, + ) + generator = model.stream( + prompt, generation_params, structure_logits_processor, sampling_params + ) + for token in generator: + assert isinstance(token, list) + assert isinstance(token[0], str) + + +def test_model_stream_seed(model): + prompt = "test" + generation_params = GenerationParameters(max_tokens=10, seed=1, stop_at=None) + structure_logits_processor = None + sampling_params = SamplingParameters( + "multinomial", + 1, + 0.9, + 50, + 1.0, + ) + generator = model.stream( + prompt, generation_params, structure_logits_processor, sampling_params + ) + for token in generator: + assert isinstance(token, str) + + +def test_exl2_max_chunk_size(): + model = models.exl2( + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + cache_q4=True, + paged=False, + max_chunk_size=128, + ) + assert isinstance(model, ExLlamaV2Model) + + +def test_exl2_cache_default(): + model = models.exl2( + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + paged=False, + ) + assert isinstance(model, ExLlamaV2Model) + + +def is_flash_attn_available(): + try: + importlib.import_module("flash_attn") + except (ImportError, AssertionError): + return False + return True + + +@pytest.mark.skipif(not is_flash_attn_available(), reason="flash-attn is not installed") +def test_exl2_paged(): + model = models.exl2( + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + cache_q4=True, + paged=True, + ) + assert isinstance(model, ExLlamaV2Model) + + +def test_exl2_draft_model(): + model = models.exl2( + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + draft_model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + cache_q4=True, + paged=False, + ) + assert isinstance(model, ExLlamaV2Model) + + +def test_exl2_draft_model_cache_default(): + model = models.exl2( + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + draft_model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + paged=False, + ) + assert isinstance(model, ExLlamaV2Model) From faadf5bad664ab7ebfa203e1792f4efd98e70118 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Thu, 29 Aug 2024 22:40:55 -0400 Subject: [PATCH 21/28] Attempt fix coverage --- tests/generate/test_integration_exllamav2.py | 74 ++++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/tests/generate/test_integration_exllamav2.py b/tests/generate/test_integration_exllamav2.py index 1ee77da3a..110fb1843 100644 --- a/tests/generate/test_integration_exllamav2.py +++ b/tests/generate/test_integration_exllamav2.py @@ -10,7 +10,7 @@ @pytest.fixture(scope="session") -def model(tmp_path_factory): +def model_exllamav2(tmp_path_factory): return models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", cache_q4=True, @@ -18,7 +18,7 @@ def model(tmp_path_factory): ) -def test_exl2_import_error(): +def test_exl2_import_error(model_exllamav2): with patch.dict("sys.modules", {"exllamav2": None}): with pytest.raises(ImportError): models.exl2( @@ -28,15 +28,15 @@ def test_exl2_import_error(): ) -def test_model_attributes(model): - assert hasattr(model, "generator") - assert hasattr(model, "tokenizer") - assert isinstance(model.tokenizer, TransformerTokenizer) - assert hasattr(model, "max_seq_len") - assert isinstance(model.max_seq_len, int) +def test_model_attributes(model_exllamav2): + assert hasattr(model_exllamav2, "generator") + assert hasattr(model_exllamav2, "tokenizer") + assert isinstance(model_exllamav2.tokenizer, TransformerTokenizer) + assert hasattr(model_exllamav2, "max_seq_len") + assert isinstance(model_exllamav2.max_seq_len, int) -def test_model_generate_prompt_types(model): +def test_model_generate_prompt_types(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at=None, seed=None) structure_logits_processor = None @@ -47,18 +47,18 @@ def test_model_generate_prompt_types(model): 50, 1.0, ) - output = model.generate( + output = model_exllamav2.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) prompt = ["test"] - output = model.generate( + output = model_exllamav2.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) -def test_model_generate_no_max_tokens(model): +def test_model_generate_no_max_tokens(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=None, stop_at=None, seed=None) structure_logits_processor = None @@ -69,13 +69,13 @@ def test_model_generate_no_max_tokens(model): 50, 1.0, ) - output = model.generate( + output = model_exllamav2.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) -def test_model_generate_test_stop_at(model): +def test_model_generate_test_stop_at(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -86,18 +86,18 @@ def test_model_generate_test_stop_at(model): 50, 1.0, ) - output = model.generate( + output = model_exllamav2.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) generation_params = GenerationParameters(max_tokens=10, stop_at=["stop"], seed=None) - output = model.generate( + output = model_exllamav2.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) -def test_model_generate_multisampling(model): +def test_model_generate_multisampling(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -105,14 +105,14 @@ def test_model_generate_multisampling(model): "multinomial", 2, ) - output = model.generate( + output = model_exllamav2.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, list) assert isinstance(output[0], str) -def test_model_prepare_generation_parameters(model): +def test_model_prepare_generation_parameters(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -120,14 +120,14 @@ def test_model_prepare_generation_parameters(model): "multinomial", 2, ) - exllamav2_params, prompts = model.prepare_generation_parameters( + exllamav2_params, prompts = model_exllamav2.prepare_generation_parameters( prompt, generation_params, sampling_params, structure_logits_processor ) assert isinstance(exllamav2_params, dict) assert isinstance(prompts, list) -def test_model_stream_prompt_types(model): +def test_model_stream_prompt_types(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at=None, seed=None) structure_logits_processor = None @@ -138,20 +138,20 @@ def test_model_stream_prompt_types(model): 50, 1.0, ) - generator = model.stream( + generator = model_exllamav2.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) prompt = ["test"] - generator = model.stream( + generator = model_exllamav2.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) -def test_model_stream_no_max_tokens(model): +def test_model_stream_no_max_tokens(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=None, stop_at=None, seed=None) structure_logits_processor = None @@ -162,14 +162,14 @@ def test_model_stream_no_max_tokens(model): 50, 1.0, ) - generator = model.stream( + generator = model_exllamav2.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) -def test_model_stream_test_stop_at(model): +def test_model_stream_test_stop_at(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -180,20 +180,20 @@ def test_model_stream_test_stop_at(model): 50, 1.0, ) - generator = model.stream( + generator = model_exllamav2.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) generation_params = GenerationParameters(max_tokens=10, stop_at=["stop"], seed=None) - generator = model.stream( + generator = model_exllamav2.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) -def test_model_stream_multisampling(model): +def test_model_stream_multisampling(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -201,7 +201,7 @@ def test_model_stream_multisampling(model): "multinomial", 2, ) - generator = model.stream( + generator = model_exllamav2.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: @@ -209,7 +209,7 @@ def test_model_stream_multisampling(model): assert isinstance(token[0], str) -def test_model_stream_seed(model): +def test_model_stream_seed(model_exllamav2): prompt = "test" generation_params = GenerationParameters(max_tokens=10, seed=1, stop_at=None) structure_logits_processor = None @@ -220,14 +220,14 @@ def test_model_stream_seed(model): 50, 1.0, ) - generator = model.stream( + generator = model_exllamav2.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) -def test_exl2_max_chunk_size(): +def test_exl2_max_chunk_size(model_exllamav2): model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", cache_q4=True, @@ -237,7 +237,7 @@ def test_exl2_max_chunk_size(): assert isinstance(model, ExLlamaV2Model) -def test_exl2_cache_default(): +def test_exl2_cache_default(model_exllamav2): model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", paged=False, @@ -254,7 +254,7 @@ def is_flash_attn_available(): @pytest.mark.skipif(not is_flash_attn_available(), reason="flash-attn is not installed") -def test_exl2_paged(): +def test_exl2_paged(model_exllamav2): model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", cache_q4=True, @@ -263,7 +263,7 @@ def test_exl2_paged(): assert isinstance(model, ExLlamaV2Model) -def test_exl2_draft_model(): +def test_exl2_draft_model(model_exllamav2): model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", draft_model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", @@ -273,7 +273,7 @@ def test_exl2_draft_model(): assert isinstance(model, ExLlamaV2Model) -def test_exl2_draft_model_cache_default(): +def test_exl2_draft_model_cache_default(model_exllamav2): model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", draft_model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", From 7ca151ce7796fea51cce22f842bfa79d314b8443 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Thu, 29 Aug 2024 22:48:58 -0400 Subject: [PATCH 22/28] Remove flash-attn requirement --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f61c4229d..82b01c4f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,6 @@ test = [ "transformers", "pillow", "exllamav2", - "flash-attn>=2.5.7", ] serve = [ "vllm>=0.3.0", From 2c241ff697b07e3e4316a6c33ed8d50f8eff3674 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Thu, 29 Aug 2024 23:29:17 -0400 Subject: [PATCH 23/28] Fixed fixture tests --- outlines/models/exllamav2.py | 10 -- tests/generate/test_integration_exllamav2.py | 106 ++++++++++++------- 2 files changed, 70 insertions(+), 46 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 6cc9c0daf..dc1a48ffb 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -205,13 +205,6 @@ def token_generator() -> Iterator[str]: return token_generator() - def load_lora(self, adapter_path: str): - from exllamav2 import ExLlamaV2Lora - - loras = [ExLlamaV2Lora.from_directory(self.generator.model, adapter_path)] - print(" -- Loading LoRA...") - self.generator.set_loras(loras) - def exl2( model_path: str, @@ -220,7 +213,6 @@ def exl2( cache_q4: bool = False, paged: bool = True, max_chunk_size: Optional[int] = None, - lora: Optional["ExLlamaV2Lora"] = None, ) -> ExLlamaV2Model: """ Load an ExLlamaV2 model. @@ -323,8 +315,6 @@ def exl2( max_chunk_size=max_chunk_size, paged=paged, ) - if lora is not None: - generator.set_loras(lora) hf_tokenizer_kwargs: dict[str, Any] = {} hf_tokenizer_kwargs.setdefault("padding_side", "left") hf_tokenizer = AutoTokenizer.from_pretrained(model_path, **hf_tokenizer_kwargs) diff --git a/tests/generate/test_integration_exllamav2.py b/tests/generate/test_integration_exllamav2.py index 110fb1843..7465220cb 100644 --- a/tests/generate/test_integration_exllamav2.py +++ b/tests/generate/test_integration_exllamav2.py @@ -18,7 +18,8 @@ def model_exllamav2(tmp_path_factory): ) -def test_exl2_import_error(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_exl2_import_error(request, model_fixture): with patch.dict("sys.modules", {"exllamav2": None}): with pytest.raises(ImportError): models.exl2( @@ -28,15 +29,19 @@ def test_exl2_import_error(model_exllamav2): ) -def test_model_attributes(model_exllamav2): - assert hasattr(model_exllamav2, "generator") - assert hasattr(model_exllamav2, "tokenizer") - assert isinstance(model_exllamav2.tokenizer, TransformerTokenizer) - assert hasattr(model_exllamav2, "max_seq_len") - assert isinstance(model_exllamav2.max_seq_len, int) +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_attributes(request, model_fixture): + model = request.getfixturevalue(model_fixture) + assert hasattr(model, "generator") + assert hasattr(model, "tokenizer") + assert isinstance(model.tokenizer, TransformerTokenizer) + assert hasattr(model, "max_seq_len") + assert isinstance(model.max_seq_len, int) -def test_model_generate_prompt_types(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_generate_prompt_types(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at=None, seed=None) structure_logits_processor = None @@ -47,18 +52,20 @@ def test_model_generate_prompt_types(model_exllamav2): 50, 1.0, ) - output = model_exllamav2.generate( + output = model.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) prompt = ["test"] - output = model_exllamav2.generate( + output = model.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) -def test_model_generate_no_max_tokens(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_generate_no_max_tokens(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=None, stop_at=None, seed=None) structure_logits_processor = None @@ -69,13 +76,15 @@ def test_model_generate_no_max_tokens(model_exllamav2): 50, 1.0, ) - output = model_exllamav2.generate( + output = model.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) -def test_model_generate_test_stop_at(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_generate_test_stop_at(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -86,18 +95,20 @@ def test_model_generate_test_stop_at(model_exllamav2): 50, 1.0, ) - output = model_exllamav2.generate( + output = model.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) generation_params = GenerationParameters(max_tokens=10, stop_at=["stop"], seed=None) - output = model_exllamav2.generate( + output = model.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, str) -def test_model_generate_multisampling(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_generate_multisampling(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -105,14 +116,16 @@ def test_model_generate_multisampling(model_exllamav2): "multinomial", 2, ) - output = model_exllamav2.generate( + output = model.generate( prompt, generation_params, structure_logits_processor, sampling_params ) assert isinstance(output, list) assert isinstance(output[0], str) -def test_model_prepare_generation_parameters(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_prepare_generation_parameters(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -120,14 +133,16 @@ def test_model_prepare_generation_parameters(model_exllamav2): "multinomial", 2, ) - exllamav2_params, prompts = model_exllamav2.prepare_generation_parameters( + exllamav2_params, prompts = model.prepare_generation_parameters( prompt, generation_params, sampling_params, structure_logits_processor ) assert isinstance(exllamav2_params, dict) assert isinstance(prompts, list) -def test_model_stream_prompt_types(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_stream_prompt_types(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at=None, seed=None) structure_logits_processor = None @@ -138,20 +153,22 @@ def test_model_stream_prompt_types(model_exllamav2): 50, 1.0, ) - generator = model_exllamav2.stream( + generator = model.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) prompt = ["test"] - generator = model_exllamav2.stream( + generator = model.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) -def test_model_stream_no_max_tokens(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_stream_no_max_tokens(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=None, stop_at=None, seed=None) structure_logits_processor = None @@ -162,14 +179,16 @@ def test_model_stream_no_max_tokens(model_exllamav2): 50, 1.0, ) - generator = model_exllamav2.stream( + generator = model.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) -def test_model_stream_test_stop_at(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_stream_test_stop_at(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -180,20 +199,22 @@ def test_model_stream_test_stop_at(model_exllamav2): 50, 1.0, ) - generator = model_exllamav2.stream( + generator = model.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) generation_params = GenerationParameters(max_tokens=10, stop_at=["stop"], seed=None) - generator = model_exllamav2.stream( + generator = model.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) -def test_model_stream_multisampling(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_stream_multisampling(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=10, stop_at="stop", seed=None) structure_logits_processor = None @@ -201,7 +222,7 @@ def test_model_stream_multisampling(model_exllamav2): "multinomial", 2, ) - generator = model_exllamav2.stream( + generator = model.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: @@ -209,7 +230,9 @@ def test_model_stream_multisampling(model_exllamav2): assert isinstance(token[0], str) -def test_model_stream_seed(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_model_stream_seed(request, model_fixture): + model = request.getfixturevalue(model_fixture) prompt = "test" generation_params = GenerationParameters(max_tokens=10, seed=1, stop_at=None) structure_logits_processor = None @@ -220,14 +243,16 @@ def test_model_stream_seed(model_exllamav2): 50, 1.0, ) - generator = model_exllamav2.stream( + generator = model.stream( prompt, generation_params, structure_logits_processor, sampling_params ) for token in generator: assert isinstance(token, str) -def test_exl2_max_chunk_size(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_exl2_max_chunk_size(request, model_fixture): + model = request.getfixturevalue(model_fixture) model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", cache_q4=True, @@ -237,7 +262,9 @@ def test_exl2_max_chunk_size(model_exllamav2): assert isinstance(model, ExLlamaV2Model) -def test_exl2_cache_default(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_exl2_cache_default(request, model_fixture): + model = request.getfixturevalue(model_fixture) model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", paged=False, @@ -245,6 +272,7 @@ def test_exl2_cache_default(model_exllamav2): assert isinstance(model, ExLlamaV2Model) +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) def is_flash_attn_available(): try: importlib.import_module("flash_attn") @@ -254,7 +282,9 @@ def is_flash_attn_available(): @pytest.mark.skipif(not is_flash_attn_available(), reason="flash-attn is not installed") -def test_exl2_paged(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_exl2_paged(request, model_fixture): + model = request.getfixturevalue(model_fixture) model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", cache_q4=True, @@ -263,7 +293,9 @@ def test_exl2_paged(model_exllamav2): assert isinstance(model, ExLlamaV2Model) -def test_exl2_draft_model(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_exl2_draft_model(request, model_fixture): + model = request.getfixturevalue(model_fixture) model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", draft_model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", @@ -273,7 +305,9 @@ def test_exl2_draft_model(model_exllamav2): assert isinstance(model, ExLlamaV2Model) -def test_exl2_draft_model_cache_default(model_exllamav2): +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_exl2_draft_model_cache_default(request, model_fixture): + model = request.getfixturevalue(model_fixture) model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", draft_model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", From a289b5a27e17970116b63992789a30afb6c74c14 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Thu, 29 Aug 2024 23:31:28 -0400 Subject: [PATCH 24/28] Removed lora --- outlines/models/exllamav2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index dc1a48ffb..e61e369ff 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -7,7 +7,6 @@ from outlines.models.transformers import TransformerTokenizer if TYPE_CHECKING: - from exllamav2 import ExLlamaV2Lora from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler From c3681a80e9892b8685359fba976a8c626607ca5c Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Fri, 30 Aug 2024 02:08:49 -0400 Subject: [PATCH 25/28] Passed coverage --- tests/generate/test_integration_exllamav2.py | 58 ++++++++++++++++++-- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/tests/generate/test_integration_exllamav2.py b/tests/generate/test_integration_exllamav2.py index 7465220cb..b54259cdf 100644 --- a/tests/generate/test_integration_exllamav2.py +++ b/tests/generate/test_integration_exllamav2.py @@ -251,8 +251,49 @@ def test_model_stream_seed(request, model_fixture): @pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) -def test_exl2_max_chunk_size(request, model_fixture): +def test_reformat_output(request, model_fixture): model = request.getfixturevalue(model_fixture) + sampling_params = SamplingParameters( + "multinomial", + 1, + ) + output = "test" + reformatted_output = model.reformat_output(output, sampling_params) + assert reformatted_output == output + output = ["test"] + reformatted_output = model.reformat_output(output, sampling_params) + assert reformatted_output == output[0] + output = ["test", "test"] + sampling_params = SamplingParameters( + "multinomial", + 1, + ) + reformatted_output = model.reformat_output(output, sampling_params) + assert len(reformatted_output) == 2 + assert reformatted_output[0] == "test" + assert reformatted_output[1] == "test" + output = ["test", "test"] + sampling_params = SamplingParameters( + "multinomial", + 2, + ) + reformatted_output = model.reformat_output(output, sampling_params) + assert len(reformatted_output) == 2 + assert reformatted_output[0] == "test" + assert reformatted_output[1] == "test" + output = ["test", "test", "test", "test"] + sampling_params = SamplingParameters( + "multinomial", + 2, + ) + reformatted_output = model.reformat_output(output, sampling_params) + assert len(reformatted_output) == 2 + assert reformatted_output[0] == ["test", "test"] + assert reformatted_output[1] == ["test", "test"] + + +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_exl2_max_chunk_size(request, model_fixture): model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", cache_q4=True, @@ -264,7 +305,6 @@ def test_exl2_max_chunk_size(request, model_fixture): @pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) def test_exl2_cache_default(request, model_fixture): - model = request.getfixturevalue(model_fixture) model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", paged=False, @@ -284,7 +324,6 @@ def is_flash_attn_available(): @pytest.mark.skipif(not is_flash_attn_available(), reason="flash-attn is not installed") @pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) def test_exl2_paged(request, model_fixture): - model = request.getfixturevalue(model_fixture) model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", cache_q4=True, @@ -295,7 +334,6 @@ def test_exl2_paged(request, model_fixture): @pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) def test_exl2_draft_model(request, model_fixture): - model = request.getfixturevalue(model_fixture) model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", draft_model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", @@ -307,10 +345,20 @@ def test_exl2_draft_model(request, model_fixture): @pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) def test_exl2_draft_model_cache_default(request, model_fixture): - model = request.getfixturevalue(model_fixture) model = models.exl2( model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", draft_model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", paged=False, ) assert isinstance(model, ExLlamaV2Model) + + +@pytest.mark.parametrize("model_fixture", ["model_exllamav2"]) +def test_exl2_set_max_seq_len(request, model_fixture): + model = models.exl2( + model_path="blockblockblock/TinyLlama-1.1B-Chat-v1.0-bpw4-exl2", + max_seq_len=2048, + paged=False, + cache_q4=True, + ) + assert isinstance(model, ExLlamaV2Model) From e6b3af6b666206b22756f1b3abefe0bb5499d3f5 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Thu, 19 Sep 2024 20:29:22 -0400 Subject: [PATCH 26/28] Added back transformers install --- outlines/models/exllamav2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index e61e369ff..607a7b0d1 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -265,7 +265,7 @@ def exl2( except ImportError: raise ImportError( - "The `exllamav2` and `torch` libraries needs to be installed in order to use `exllamav2` models." + "The `exllamav2`, `transformers` and `torch` libraries needs to be installed in order to use `exllamav2` models." ) config = ExLlamaV2Config(model_path) if max_chunk_size is not None: From 5508c922debeff48cdc843e5ebdbff4696a0dedc Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Fri, 20 Sep 2024 13:44:57 -0400 Subject: [PATCH 27/28] Fixed per review --- outlines/models/exllamav2.py | 43 ++++++++++++++------ tests/generate/test_integration_exllamav2.py | 2 - 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/outlines/models/exllamav2.py b/outlines/models/exllamav2.py index 607a7b0d1..f06b7e46e 100644 --- a/outlines/models/exllamav2.py +++ b/outlines/models/exllamav2.py @@ -1,12 +1,12 @@ import dataclasses -from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple, TypedDict, Union +from typing import TYPE_CHECKING, Iterator, List, Optional, Tuple, TypedDict, Union from typing_extensions import Unpack from outlines.generate.api import GenerationParameters, SamplingParameters -from outlines.models.transformers import TransformerTokenizer if TYPE_CHECKING: + from exllamav2 import ExLlamaV2Tokenizer from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler @@ -24,7 +24,7 @@ class ExLlamaV2Model: def __init__( self, generator: "ExLlamaV2DynamicGenerator", - tokenizer: TransformerTokenizer, + tokenizer: "ExLlamaV2Tokenizer", max_seq_len: int, ): self.generator = generator @@ -54,7 +54,7 @@ def prepare_generation_parameters( max_tokens = [] for prompt in prompts: ids = self.generator.tokenizer.encode( - prompt, encode_special_tokens=False + prompt, encode_special_tokens=True ) prompt_tokens = ids.shape[-1] max_tokens.append(self.max_seq_len - prompt_tokens) @@ -96,6 +96,14 @@ def prepare_generation_parameters( def reformat_output( self, output: Union[str, List[str]], sampling_parameters: SamplingParameters ): + """ + The purpose of this function is to reformat the output from exllamav2's output format to outline's output format + For exllamav2, it mainly accepts only a list or a string(they also do cfg sampling with tuples but we will ignore this for now) + The exllamav2's logic is + 1. If the prompt is a string, return a string. This is the same as outlines + 2. If a prompt is a list, return a list. This is not the same as outlines output in that if the list is only one element, the string is expected to be outputted. + 3. There is no such thing as num_samples, so the prompts had to be duplicated by num_samples times. Then, we had the function output a list of lists + """ if isinstance(output, str): return output if len(output) == 1: @@ -128,12 +136,19 @@ def generate( sampling_parameters, structure_logits_processor, ) + """ + In exllamav2, it needs the max amount of new tokens generated. + The reason exllamav2_params["max_new_tokens"] is a list is because in prepare_generation_parameters + the max amount of tokens that can be generated by the model for each prompt(by encoding with tokenizer) is calculated. + The minimum is picked because otherwise it might be possible for one of the + prompts to exceed the max sequence length. + """ output = self.generator.generate( prompt=prompts, gen_settings=exllamav2_params["gen_settings"], max_new_tokens=min(exllamav2_params["max_new_tokens"]), completion_only=True, - encode_special_tokens=False, + encode_special_tokens=True, stop_conditions=exllamav2_params["stop_conditions"], add_bos=False, seed=exllamav2_params["seed"], @@ -165,7 +180,7 @@ def stream( seed = exllamav2_params["seed"] for idx, p in enumerate(prompts): input_ids = self.generator.tokenizer.encode( - p, encode_special_tokens=False, add_bos=False + p, encode_special_tokens=True, add_bos=False ) job = ExLlamaV2DynamicJob( @@ -205,6 +220,14 @@ def token_generator() -> Iterator[str]: return token_generator() +# Taken from https://github.com/lapp0/exllamav2/pull/1/files#diff-26f303de07c10aad998e33d3df52581643673a598162cc4b35ef051f52d7c60b +def patch_tokenizer(tokenizer): + tokenizer.vocabulary = tokenizer.piece_to_id + tokenizer.special_tokens = set(tokenizer.extended_piece_to_id) + tokenizer.convert_token_to_string = lambda t: t + return tokenizer + + def exl2( model_path: str, draft_model_path: Optional[str] = None, @@ -261,7 +284,6 @@ def exl2( ExLlamaV2Tokenizer, ) from exllamav2.generator import ExLlamaV2DynamicGenerator - from transformers import AutoTokenizer except ImportError: raise ImportError( @@ -284,7 +306,7 @@ def exl2( print("Loading tokenizer...") tokenizer = ExLlamaV2Tokenizer(config) - tokenizer.vocabulary = tokenizer.extended_piece_to_id + tokenizer = patch_tokenizer(tokenizer) max_batch_size = 4 if paged else 1 draft_model = None @@ -314,8 +336,5 @@ def exl2( max_chunk_size=max_chunk_size, paged=paged, ) - hf_tokenizer_kwargs: dict[str, Any] = {} - hf_tokenizer_kwargs.setdefault("padding_side", "left") - hf_tokenizer = AutoTokenizer.from_pretrained(model_path, **hf_tokenizer_kwargs) max_seq_len = cache.max_seq_len - return ExLlamaV2Model(generator, TransformerTokenizer(hf_tokenizer), max_seq_len) + return ExLlamaV2Model(generator, tokenizer, max_seq_len) diff --git a/tests/generate/test_integration_exllamav2.py b/tests/generate/test_integration_exllamav2.py index b54259cdf..51359a0e5 100644 --- a/tests/generate/test_integration_exllamav2.py +++ b/tests/generate/test_integration_exllamav2.py @@ -6,7 +6,6 @@ import outlines.models as models from outlines.generate.api import GenerationParameters, SamplingParameters from outlines.models.exllamav2 import ExLlamaV2Model -from outlines.models.transformers import TransformerTokenizer @pytest.fixture(scope="session") @@ -34,7 +33,6 @@ def test_model_attributes(request, model_fixture): model = request.getfixturevalue(model_fixture) assert hasattr(model, "generator") assert hasattr(model, "tokenizer") - assert isinstance(model.tokenizer, TransformerTokenizer) assert hasattr(model, "max_seq_len") assert isinstance(model.max_seq_len, int) From b7e92a18b56190e113ee7a25f8faa3d06f5760cb Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Fri, 20 Sep 2024 14:08:30 -0400 Subject: [PATCH 28/28] Made coverage 100% --- tests/generate/test_integration_exllamav2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/generate/test_integration_exllamav2.py b/tests/generate/test_integration_exllamav2.py index 51359a0e5..12c4143b3 100644 --- a/tests/generate/test_integration_exllamav2.py +++ b/tests/generate/test_integration_exllamav2.py @@ -33,6 +33,7 @@ def test_model_attributes(request, model_fixture): model = request.getfixturevalue(model_fixture) assert hasattr(model, "generator") assert hasattr(model, "tokenizer") + assert model.tokenizer.convert_token_to_string(1) == 1 assert hasattr(model, "max_seq_len") assert isinstance(model.max_seq_len, int)