
Commit 1e5785e

authored Nov 24, 2024
Merge pull request #40 from remichu-ai/transformer_multimodal
Qwen 2 VL
2 parents d5be6e0 + 872dabd commit 1e5785e

File tree: 5 files changed (+113, -89 lines)


README.md (+15, -14)
@@ -24,9 +24,10 @@ Do checkout [TabbyAPI](https://github.com/theroyallab/tabbyAPI) if you want a re
 
 # NEW - Vision Model
 
-From `gallama` version 0.0.7, there is experimental support for Vision model.
+As of v0.0.8post1, Qwen 2 VL (Image only, no Video) and Pixtral are supported via Exllama (>=0.2.4).
 
-Currently, as of v0.0.8, Pixtral is supported via Exllama (>=0.2.4) and Qwen 2 VL series of model is supported via transformers.
+For Pixtral, please install Exllama V2 `v0.2.4` onwards.
+For Qwen 2 VL, please install the `dev` branch of Exllama V2, as the code is not yet merged to `v0.2.4`.
 
 After Exllama roll out support for Qwen 2 VL, running model via transformers will be depreciated.
 Currently, both exllamaV2 and llama.cpp do not support Vision model yet. Hence, this is achieved by running `transformers` with the use of awq for quantization.
@@ -49,16 +50,16 @@ This is already be handled in the requirements.txt, however, getting transformer
 After installation you can download by following commands (choose a version that fit your VRAM):
 ```shell
 # 2B model
-gallama download qwen-2-VL-2B:4.0 --backend=transformers
-gallama run qwen-2-VL-2B_transformers
+gallama download qwen-2-VL-2B:4.0
+gallama run qwen-2-VL-2B
 
 # 7B model
-gallama download qwen-2-VL-7B:4.0 --backend=transformers
-gallama run qwen-2-VL-7B_transformers
+gallama download qwen-2-VL-7B:4.0
+gallama run qwen-2-VL-7B
 
 # 72B model
-gallama download qwen-2-VL-72B:4.0 --backend=transformers
-gallama run qwen-2-VL-72B_transformers
+gallama download qwen-2-VL-72B:4.0
+gallama run qwen-2-VL-72B
 ```
 
 If you need an UI to run it, check out Gallama UI, it is working with images, however, the support is not perfect at the moment:
@@ -131,12 +132,12 @@ gallama list available
 
 **Vision Large Language Models**
 
-| Model         | Backend      | Available Quantizations (bpw)                          |
-|---------------|--------------|--------------------------------------------------------|
-| qwen-2-VL-2B  | transformers | `4.0`, `16.0`                                          |
-| qwen-2-VL-7B  | transformers | `4.0`, `16.0`                                          |
-| qwen-2-VL-72B | transformers | `4.0`, `16.0`                                          |
-| pixtral       | exllama      | `2.5`, `3.0`, `3.5`, `4.0`, `4.5`, `5.0`, `6.0`, `8.0` |
+| Model         | Backend      | Available Quantizations (bpw)                          |
+|---------------|--------------|--------------------------------------------------------|
+| qwen-2-VL-2B  | exllama      | `3.0`, `3.5`, `4.0`, `4.5`, `5.0`, `6.0`, `8.0`        |
+| qwen-2-VL-7B  | exllama      | `3.0`, `3.5`, `4.0`, `4.5`, `5.0`, `6.0`, `8.0`        |
+| qwen-2-VL-72B | exllama      | `3.0`, `3.5`, `4.0`, `4.5`, `5.0`, `6.0`, `8.0`        |
+| pixtral       | exllama      | `2.5`, `3.0`, `3.5`, `4.0`, `4.5`, `5.0`, `6.0`, `8.0` |
 
 
 **Embedding Models:**

pyproject.toml (+1, -1)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "gallama"
-version = "0.0.8"
+version = "0.0.8post1"
 description = "An opinionated Llama Server engine with a focus on agentic tasks"
 authors = [{name = "David", email = "trantrungduc91@example.com"}]
 license = {text = "MIT"}

src/gallama/backend/chatgenerator.py (+35, -25)
@@ -29,6 +29,7 @@
 from qwen_vl_utils import process_vision_info
 from .model_support.llama3_2_vision.text_streamer import CustomTextIteratorStreamer
 from ..utils.utils import get_image
+from functools import lru_cache
 
 try:
     from formatron.formatter import FormatterBuilder
@@ -597,6 +598,19 @@ def get_stop_word(text, stop_words) -> Union[str, None]:
 
         return None
 
+    @staticmethod
+    @lru_cache(128)  # TODO set this dynamically
+    def get_image_embedding_cached(processor, model, tokenizer, url):
+        img = get_image(url=url)
+
+        return processor.get_image_embeddings(
+            model=model,
+            tokenizer=tokenizer,
+            image=img,
+            text_alias=None,  # passing None will let the model generate its own embedding
+        )
+
+
     async def generate(
         self,
         prompt: str,
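
Note on the hunk above: `functools.lru_cache` keys the cache on all four arguments, so repeated generations that reference the same image URL (with the same processor/model/tokenizer objects) reuse the embedding instead of re-downloading and re-encoding the image. Below is a minimal, self-contained sketch of the same pattern; `DummyProcessor` and the decoded-image string are hypothetical stand-ins for gallama's `get_image` helper and the ExllamaV2 vision processor, not the real APIs.

```python
from functools import lru_cache


class DummyProcessor:
    """Hypothetical stand-in for the vision processor used in the diff above."""

    calls = 0  # counts how often the "expensive" embedding step actually runs

    def get_image_embeddings(self, model, tokenizer, image, text_alias=None):
        DummyProcessor.calls += 1
        return f"<embedding for {image}>"


class Generator:
    @staticmethod
    @lru_cache(128)  # cache key is (processor, model, tokenizer, url); all arguments must be hashable
    def get_image_embedding_cached(processor, model, tokenizer, url):
        img = f"decoded:{url}"  # stand-in for get_image(url=url)
        return processor.get_image_embeddings(
            model=model,
            tokenizer=tokenizer,
            image=img,
            text_alias=None,
        )


proc = DummyProcessor()
for _ in range(3):
    Generator.get_image_embedding_cached(proc, None, None, "http://example.com/cat.png")

print(DummyProcessor.calls)                               # -> 1 (computed once, reused twice)
print(Generator.get_image_embedding_cached.cache_info())  # -> CacheInfo(hits=2, misses=1, maxsize=128, currsize=1)
```

One consequence of caching by URL is that a changed image behind an unchanged URL is served stale until the entry is evicted; the fixed `maxsize=128` is the `TODO set this dynamically` noted in the diff.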
@@ -617,24 +631,6 @@ async def generate(
     ) -> (str, GenerationStats):
         try:
 
-            def extract_uuid_strings(text):
-                """
-                Extract all strings matching the format '{{IMG-<uuid-like-hex>}}'
-
-                Args:
-                    text (str): Input string to search for matching patterns
-
-                Returns:
-                    list: List of all matching strings found in the input text
-                """
-                # Pattern to match strings like '{{IMG-<uuid-hex>}}'
-                pattern = r'\{\{IMG-[0-9a-f]{32}\}\}'
-
-                # Find all matching occurrences in the text
-                matches = re.findall(pattern, text)
-
-                return matches
-
             # ensure that generator is initialized
             if self.pipeline is None:
                 self.pipeline = await self._get_pipeline_async()
@@ -679,21 +675,35 @@ def extract_uuid_strings(text):
 
             image_embeddings = None
             if vision_required and self.processor:
-                image_token_list = extract_uuid_strings(prompt)  # extract all the placeholder token used for img placeholder
+                # count the number of image placeholder token
+                image_token = "{{IMG-PlaceHolderTokenHere}}"  # TODO move to a constant
+                image_token_count = prompt.count(image_token)
 
-                assert len(image_token_list) == len(
-                    image_list), f"Mismatch in image tokens and images: {len(image_token_list)} tokens vs {len(image_list)} images"
+                # raise error if the img token count and image to embed not match
+                assert image_token_count == len(
+                    image_list), f"Mismatch in image tokens and images: {image_token_count} tokens vs {len(image_list)} images"
 
                 # Convert image(s) to embeddings
+
                 image_embeddings = [
-                    self.processor.get_image_embeddings(
+                    self.get_image_embedding_cached(
+                        processor=self.processor,
                         model=self.model,
                         tokenizer=self.tokenizer,
-                        image=img,
-                        text_alias=alias,
+                        url=url,
                     )
-                    for (alias, img) in zip(image_token_list, [get_image(url=url) for url in image_list])
+
+                    for url in image_list
                 ]
+                # logger.info(self.get_image_embedding_cached.cache_info())
+
+                # replace embedding
+                for emb in image_embeddings:
+                    prompt = prompt.replace(image_token, emb.text_alias, 1)  # replace one token with 1 embedding sequentially
+                    # logger.info(emb.text_alias)
+
+                # logger.info(prompt)
+
             elif vision_required and not self.processor:
                 if version('exllamav2') < '0.2.4':
                     raise Exception(f"Current Exllama version of {version('exllamav2')} do not support Vision model")

src/gallama/backend/prompt_engine.py (+2, -1)
@@ -274,7 +274,8 @@ def convert_multimodal_content_list_to_string(
                     content_str += self.get_vision_start_token() + self.get_image_pad_token() + self.get_vision_end_token()  # TODO
                 else:
                     # use a standard token as place holder, TODO - refractor
-                    content_str += "{{IMG-" + f"{uuid.uuid4().hex}" + "}}"
+                    # content_str += "{{IMG-" + f"{uuid.uuid4().hex}" + "}}"
+                    content_str += "{{IMG-PlaceHolderTokenHere}}"  # TODO use a constant instead
             else:
                 raise ValueError("Unexpected content type ")
 
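This change drops the per-image UUID placeholder in favour of one fixed token, so the generator only needs `str.count` rather than a UUID regex. A rough sketch of how a multimodal content list might flatten into a prompt string under this scheme; the content-part dictionaries shown are an assumed shape, not copied from `prompt_engine.py`.

```python
IMAGE_TOKEN = "{{IMG-PlaceHolderTokenHere}}"


def flatten_content(parts: list[dict]) -> str:
    """Hypothetical flattening of a multimodal content list into one prompt string."""
    content_str = ""
    for part in parts:
        if part["type"] == "text":
            content_str += part["text"]
        elif part["type"] == "image_url":
            # every image becomes the same fixed placeholder token
            content_str += IMAGE_TOKEN
        else:
            raise ValueError("Unexpected content type ")
    return content_str


print(flatten_content([
    {"type": "text", "text": "What is in this picture? "},
    {"type": "image_url", "image_url": {"url": "http://example.com/cat.png"}},
]))
# -> What is in this picture? {{IMG-PlaceHolderTokenHere}}
```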
src/gallama/data/default_model_list.yaml (+60, -48)
@@ -633,64 +633,76 @@ qwen-2-VL-2B:
   default_cache_quant: Q4
   prompt_template: Qwen2-VL
   repo:
-    - repo: "Qwen/Qwen2-VL-2B-Instruct-AWQ"
-      branch: ['main']
-      quant: [4.0]
-      backend: transformers
-      transformers_args:
-        model_class: "transformers.Qwen2VLForConditionalGeneration"
-        tokenizer_class: "transformers.AutoTokenizer"
-        processor_class: "transformers.AutoProcessor"
-    - repo: "Qwen/Qwen2-VL-2B-Instruct"
-      branch: ['main']
-      quant: [16.0]
-      backend: transformers
-      transformers_args:
-        model_class: "transformers.Qwen2VLForConditionalGeneration"
-        tokenizer_class: "transformers.AutoTokenizer"
-        processor_class: "transformers.AutoProcessor"
+    - repo: "turboderp/Qwen2-VL-2B-Instruct-exl2"
+      branch: ['3.0bpw', '3.5bpw', '4.0bpw', '4.5bpw', '5.0bpw', '6.0bpw', '8.0bpw']
+      quant: [3.0, 3.5, 4.0, 4.5, 5.0, 6.0, 8.0]
+      backend: exllama
+#    - repo: "Qwen/Qwen2-VL-2B-Instruct-AWQ"
+#      branch: ['main']
+#      quant: [4.0]
+#      backend: transformers
+#      transformers_args:
+#        model_class: "transformers.Qwen2VLForConditionalGeneration"
+#        tokenizer_class: "transformers.AutoTokenizer"
+#        processor_class: "transformers.AutoProcessor"
+#    - repo: "Qwen/Qwen2-VL-2B-Instruct"
+#      branch: ['main']
+#      quant: [16.0]
+#      backend: transformers
+#      transformers_args:
+#        model_class: "transformers.Qwen2VLForConditionalGeneration"
+#        tokenizer_class: "transformers.AutoTokenizer"
+#        processor_class: "transformers.AutoProcessor"
 qwen-2-VL-7B:
   default_quant: 4.0
   default_cache_quant: Q4
   prompt_template: Qwen2-VL
   repo:
-    - repo: "Qwen/Qwen2-VL-7B-Instruct-AWQ"
-      branch: ['main']
-      quant: [4.0]
-      backend: transformers
-      transformers_args:
-        model_class: "transformers.Qwen2VLForConditionalGeneration"
-        tokenizer_class: "transformers.AutoTokenizer"
-        processor_class: "transformers.AutoProcessor"
-    - repo: "Qwen/Qwen2-VL-7B-Instruct"
-      branch: ['main']
-      quant: [16.0]
-      backend: transformers
-      transformers_args:
-        model_class: "transformers.Qwen2VLForConditionalGeneration"
-        tokenizer_class: "transformers.AutoTokenizer"
-        processor_class: "transformers.AutoProcessor"
+    - repo: "turboderp/Qwen2-VL-7B-Instruct-exl2"
+      branch: ['3.0bpw', '3.5bpw', '4.0bpw', '4.5bpw', '5.0bpw', '6.0bpw', '8.0bpw']
+      quant: [3.0, 3.5, 4.0, 4.5, 5.0, 6.0, 8.0]
+      backend: exllama
+#    - repo: "Qwen/Qwen2-VL-7B-Instruct-AWQ"
+#      branch: ['main']
+#      quant: [4.0]
+#      backend: transformers
+#      transformers_args:
+#        model_class: "transformers.Qwen2VLForConditionalGeneration"
+#        tokenizer_class: "transformers.AutoTokenizer"
+#        processor_class: "transformers.AutoProcessor"
+#    - repo: "Qwen/Qwen2-VL-7B-Instruct"
+#      branch: ['main']
+#      quant: [16.0]
+#      backend: transformers
+#      transformers_args:
+#        model_class: "transformers.Qwen2VLForConditionalGeneration"
+#        tokenizer_class: "transformers.AutoTokenizer"
+#        processor_class: "transformers.AutoProcessor"
 qwen-2-VL-72B:
   default_quant: 4.0
   default_cache_quant: Q4
   prompt_template: Qwen2-VL
   repo:
-    - repo: "Qwen/Qwen2-VL-72B-Instruct-AWQ"
-      branch: ['main']
-      quant: [4.0]
-      backend: transformers
-      transformers_args:
-        model_class: "transformers.Qwen2VLForConditionalGeneration"
-        tokenizer_class: "transformers.AutoTokenizer"
-        processor_class: "transformers.AutoProcessor"
-    - repo: "Qwen/Qwen2-VL-72B-Instruct-AWQ"
-      branch: ['main']
-      quant: [16.0]
-      backend: transformers
-      transformers_args:
-        model_class: "transformers.Qwen2VLForConditionalGeneration"
-        tokenizer_class: "transformers.AutoTokenizer"
-        processor_class: "transformers.AutoProcessor"
+    - repo: "turboderp/Qwen2-VL-7B-Instruct-exl2"
+      branch: ['3.0bpw', '3.5bpw', '4.0bpw', '4.5bpw', '5.0bpw', '6.0bpw', '8.0bpw']
+      quant: [3.0, 3.5, 4.0, 4.5, 5.0, 6.0, 8.0]
+      backend: exllama
+#    - repo: "Qwen/Qwen2-VL-72B-Instruct-AWQ"
+#      branch: ['main']
+#      quant: [4.0]
+#      backend: transformers
+#      transformers_args:
+#        model_class: "transformers.Qwen2VLForConditionalGeneration"
+#        tokenizer_class: "transformers.AutoTokenizer"
+#        processor_class: "transformers.AutoProcessor"
+#    - repo: "Qwen/Qwen2-VL-72B-Instruct-AWQ"
+#      branch: ['main']
+#      quant: [16.0]
+#      backend: transformers
+#      transformers_args:
+#        model_class: "transformers.Qwen2VLForConditionalGeneration"
+#        tokenizer_class: "transformers.AutoTokenizer"
+#        processor_class: "transformers.AutoProcessor"
 
 # qwen 2.5 Coder
 qwen-2.5-Coder-32B: