
Commit 3f19cbf
Merge remote-tracking branch 'turboderp/master'
2 parents 2776691 + b9c025b


47 files changed: +1197 -71 lines

.github/workflows/build-wheels-release-linux.yml

+347
Large diffs are not rendered by default.

.github/workflows/build-wheels-release-rocm62.yml

+347
Large diffs are not rendered by default.

examples/chat.py

+1-1
@@ -188,7 +188,7 @@ def format_prompt(user_prompt, first):
     global system_prompt, prompt_format

     if first:
-        return prompt_format.first_prompt(not system_prompt) \
+        return prompt_format.first_prompt(bool(system_prompt)) \
             .replace("<|system_prompt|>", system_prompt) \
             .replace("<|user_prompt|>", user_prompt)
     else:

examples/chat_prompts.py

+37
@@ -547,6 +547,42 @@ def print_extra_newline(self):
         return True


+class PromptFormat_granite3(PromptFormat):
+    description = "Granite 3"
+
+    def __init__(self):
+        super().__init__()
+        pass
+
+    def default_system_prompt(self):
+        return "You are Granite, developed by IBM. You are a helpful AI assistant."
+
+    def first_prompt(self, sysprompt):
+        r = ""
+        if sysprompt:
+            r += """<|start_of_role|>system<|end_of_role|><|system_prompt|><|end_of_text|>"""
+        r += """<|start_of_role|>user<|end_of_role|><|user_prompt|><|end_of_text|>"""
+        r += """<|start_of_role|>assistant<|end_of_role|>"""
+        return r
+
+    def subs_prompt(self):
+        r = ""
+        r += """<|start_of_role|>user<|end_of_role|><|user_prompt|><|end_of_text|>"""
+        r += """<|start_of_role|>assistant<|end_of_role|>"""
+        return r
+
+    def stop_conditions(self, tokenizer):
+        return [
+            tokenizer.eos_token_id,
+        ]
+
+    def encoding_options(self):
+        return True, False, True
+
+    def print_extra_newline(self):
+        return True
+
+
 class PromptFormat_cohere(PromptFormat):
     description = "Cohere"

@@ -610,4 +646,5 @@ def print_extra_newline(self):
     "cohere": PromptFormat_cohere,
     "phi3": PromptFormat_phi3,
     "granite": PromptFormat_granite,
+    "granite3": PromptFormat_granite3,
 }
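For reference, a minimal sketch of what the new Granite 3 format produces for a first turn. It assumes the examples directory is on sys.path so chat_prompts.py can be imported directly (chat.py normally reaches it through the prompt_formats lookup); the user message is invented.

# Minimal sketch, assuming examples/chat_prompts.py is importable as a module.
from chat_prompts import PromptFormat_granite3

pf = PromptFormat_granite3()
system_prompt = pf.default_system_prompt()

# bool(system_prompt) mirrors the chat.py change above: pass True only when a
# system prompt is actually set, so the system role block is emitted.
prompt = pf.first_prompt(bool(system_prompt)) \
    .replace("<|system_prompt|>", system_prompt) \
    .replace("<|user_prompt|>", "Summarize this merge in one sentence.")

# Result: the system, user and assistant role blocks concatenated, e.g.
# <|start_of_role|>system<|end_of_role|>...<|end_of_text|><|start_of_role|>user<|end_of_role|>...
print(prompt)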
File renamed without changes.
File renamed without changes.

examples/media/test_video_01.png (72.5 KB)
examples/media/test_video_02.png (73.1 KB)
examples/media/test_video_03.png (76.6 KB)
examples/media/test_video_04.png (77.9 KB)
examples/media/test_video_05.png (79.4 KB)
examples/media/test_video_06.png (79.3 KB)
examples/media/test_video_07.png (81.4 KB)
examples/media/test_video_08.png (82 KB)
examples/media/test_video_09.png (82 KB)
examples/media/test_video_10.png (84.5 KB)
examples/media/test_video_11.png (86 KB)
examples/media/test_video_12.png (86.2 KB)
examples/media/test_video_13.png (86.6 KB)
examples/media/test_video_14.png (87 KB)
examples/media/test_video_15.png (87.8 KB)
examples/media/test_video_16.png (87.3 KB)
examples/media/test_video_17.png (87.3 KB)
examples/media/test_video_18.png (87.2 KB)
examples/media/test_video_19.png (89.8 KB)
examples/media/test_video_20.png (90.9 KB)
examples/media/test_video_21.png (90.1 KB)
examples/media/test_video_22.png (91 KB)

examples/multimodal.py

+6-3
@@ -18,6 +18,9 @@
 from PIL import Image
 import requests

+import torch
+torch.set_printoptions(precision = 5, sci_mode = False, linewidth=200)
+
 # Models used:
 #
 # Pixtral:
@@ -39,8 +42,8 @@
 model_directory = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"

 images = [
-    {"file": "test_image_1.jpg"},
-    {"file": "test_image_2.jpg"},
+    {"file": "media/test_image_1.jpg"},
+    {"file": "media/test_image_2.jpg"},
     # {"url": "https://media.istockphoto.com/id/1212540739/photo/mom-cat-with-kitten.jpg?s=612x612&w=0&k=20&c=RwoWm5-6iY0np7FuKWn8FTSieWxIoO917FF47LfcBKE="},
     # {"url": "https://i.dailymail.co.uk/1s/2023/07/10/21/73050285-12283411-Which_way_should_I_go_One_lady_from_the_US_shared_this_incredibl-a-4_1689019614007.jpg"},
     # {"url": "https://images.fineartamerica.com/images-medium-large-5/metal-household-objects-trevor-clifford-photography.jpg"}
@@ -127,7 +130,7 @@ def get_image(file = None, url = None):
     "<|im_start|>user\n" +
     placeholders +
     instruction +
-    "\n" +
+    "<|im_end|>\n" +
     "<|im_start|>assistant\n"
 )

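The last hunk closes the user turn that was previously left open. A minimal sketch of the corrected ChatML layout, with placeholders and instruction stubbed in (both are variables defined elsewhere in multimodal.py):

# Sketch only: stand-in values for the variables used in multimodal.py.
placeholders = "{{IMAGE_1}}{{IMAGE_2}}"
instruction = "Describe the images."

prompt_tail = (
    "<|im_start|>user\n" +
    placeholders +
    instruction +
    "<|im_end|>\n" +            # end-of-turn token added by this commit
    "<|im_start|>assistant\n"
)
print(prompt_tail)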

examples/multimodal_video.py

+149
@@ -0,0 +1,149 @@
+import sys, os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from exllamav2 import (
+    ExLlamaV2,
+    ExLlamaV2Config,
+    ExLlamaV2Cache,
+    ExLlamaV2Tokenizer,
+    ExLlamaV2VisionTower,
+)
+
+from exllamav2.generator import (
+    ExLlamaV2DynamicGenerator,
+    ExLlamaV2DynamicJob,
+    ExLlamaV2Sampler,
+)
+
+from PIL import Image
+import requests, glob
+
+import torch
+torch.set_printoptions(precision = 5, sci_mode = False, linewidth=200)
+
+# Model used:
+#
+# Qwen2-VL:
+# https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
+# https://huggingface.co/turboderp/Qwen2-VL-7B-Instruct-exl2
+
+streaming = True
+greedy = True
+
+model_directory = "/mnt/str/models/qwen2-vl-7b-instruct-exl2/6.0bpw"
+images_mask = os.path.join(os.path.dirname(os.path.abspath(__file__)), "media/test_video_*.png")
+
+frames = [
+    {"file": f}
+    for f in sorted(glob.glob(images_mask))
+]
+
+instruction = "Describe this video."
+
+# Initialize model
+
+config = ExLlamaV2Config(model_directory)
+config.max_seq_len = 16384  # Pixtral default is 1M
+
+# Load vision model and multimodal projector and initialize preprocessor
+
+vision_model = ExLlamaV2VisionTower(config)
+vision_model.load(progress = True)
+
+# Load EXL2 model
+
+model = ExLlamaV2(config)
+cache = ExLlamaV2Cache(model, lazy = True, max_seq_len = 16384)
+model.load_autosplit(cache, progress = True)
+tokenizer = ExLlamaV2Tokenizer(config)
+
+# Create generator
+
+generator = ExLlamaV2DynamicGenerator(
+    model = model,
+    cache = cache,
+    tokenizer = tokenizer,
+)
+
+# Util function to get a PIL image from a URL or from a file in the script's directory
+
+def get_image(file = None, url = None):
+    assert (file or url) and not (file and url)
+    if file:
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        file_path = os.path.join(script_dir, file)
+        return Image.open(file_path)
+    elif url:
+        return Image.open(requests.get(url, stream = True).raw)
+
+# Convert video to embeddings. Aliases can be given explicitly with the text_alias argument, but here we
+# use automatically assigned unique identifiers, then concatenate them into a string
+
+video_embedding = vision_model.get_video_embeddings(
+    model = model,
+    tokenizer = tokenizer,
+    video = [get_image(**img_args) for img_args in frames],
+)
+video_embeddings = [video_embedding]
+
+# Define prompt
+
+prompt = (
+    "<|im_start|>system\n" +
+    "You are a helpful assistant.<|im_end|>\n" +
+    "<|im_start|>user\n" +
+    video_embedding.text_alias +
+    # "\n" +
+    instruction +
+    "<|im_end|>\n" +
+    "<|im_start|>assistant\n"
+)
+
+# Generate
+
+if streaming:
+
+    input_ids = tokenizer.encode(
+        prompt,
+        # add_bos = True,
+        encode_special_tokens = True,
+        embeddings = video_embeddings,
+    )
+
+    job = ExLlamaV2DynamicJob(
+        input_ids = input_ids,
+        max_new_tokens = 500,
+        decode_special_tokens = True,
+        stop_conditions = [tokenizer.eos_token_id],
+        gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
+        embeddings = video_embeddings,
+    )
+
+    generator.enqueue(job)
+
+    print()
+    print(prompt, end = ""); sys.stdout.flush()
+
+    eos = False
+    while generator.num_remaining_jobs():
+        results = generator.iterate()
+        for result in results:
+            text = result.get("text", "")
+            print(text, end = ""); sys.stdout.flush()
+
+    print()
+
+else:
+
+    output = generator.generate(
+        prompt = prompt,
+        max_new_tokens = 500,
+        add_bos = True,
+        encode_special_tokens = True,
+        decode_special_tokens = True,
+        stop_conditions = [tokenizer.eos_token_id],
+        gen_settings = ExLlamaV2Sampler.Settings.greedy() if greedy else None,
+        embeddings = video_embeddings,
+    )
+
+    print(output)
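As the comment in the script notes, get_video_embeddings assigns a unique text alias automatically, but one can be given explicitly. A hedged sketch of that variant, continuing from the variables defined in the script above (the argument name is taken from that comment; the alias string itself is arbitrary):

# Sketch only, reusing vision_model, model, tokenizer, get_image and frames
# from the script above.
video_embedding = vision_model.get_video_embeddings(
    model = model,
    tokenizer = tokenizer,
    video = [get_image(**img_args) for img_args in frames],
    text_alias = "{{VIDEO_1}}",   # then place "{{VIDEO_1}}" in the prompt instead of video_embedding.text_alias
)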

exllamav2/architecture.py

+34-1
@@ -312,7 +312,7 @@ class Params:
             })
             self.mmp.mlp_gate = False
             self.mmp.mlp_act_func = "gelu"
-            self.mmp.mlp_bias = True
+            self.mmp.mlp_bias = bool(read_config.get("multimodal_projector_bias", True))

         # Yi

@@ -515,6 +515,28 @@ class Params:
             self.lm.parallel_decoder_blocks = True
             self.lm.requires_bos = True

+        # Cohere 2
+
+        if arch_string == "Cohere2ForCausalLM":
+            arch_recognized = True
+            self.lm.layer_keys += \
+                layer_keys_cohere_norms + \
+                layer_keys_llama_attn + \
+                layer_keys_llama_mlp
+            self.lm.expect_keys += \
+                expect_keys_gemma
+            self.lm.keys.update({
+                "norm_eps": "layer_norm_eps",
+                "lm_head": "model.embed_tokens",
+                "norm_1": ".input_layernorm",
+                "norm_2": None,
+            })
+            self.lm.norm = "layernorm"
+            self.lm.rope_style = RopeStyle.GPTJ
+            self.lm.parallel_decoder_blocks = True
+            self.lm.requires_bos = True
+            self.lm.alternating_swa = True
+
         # DBRX

         if arch_string == "DbrxForCausalLM":
@@ -659,6 +681,17 @@ class Params:
             self.lm.expect_keys += \
                 expect_keys_llama

+        # Granite (v3)
+
+        if arch_string == "GraniteForCausalLM":
+            arch_recognized = True
+            self.lm.layer_keys += \
+                layer_keys_llama_norms + \
+                layer_keys_llama_attn + \
+                layer_keys_llama_mlp
+            self.lm.expect_keys += \
+                expect_keys_llama
+
         # Llama (default + fallback)

         if arch_string != "LlamaForCausalLM" and not arch_recognized:
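The arch_string matched here comes from the model's config.json. A minimal sketch of where that value originates, assuming a standard Hugging Face checkpoint layout (the path is hypothetical; exllamav2's own config loader handles this internally):

import json

# Hypothetical model directory; any Granite 3 checkpoint carries the "architectures" field.
with open("/path/to/granite-3-8b-instruct/config.json") as f:
    arch_string = json.load(f)["architectures"][0]

print(arch_string)   # "GraniteForCausalLM" routes into the new Granite branch above;
                     # "Cohere2ForCausalLM" additionally enables alternating sliding-window attention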

exllamav2/attn.py

+3-1
@@ -211,7 +211,9 @@ def __init__(
         if cfg.use_qk_norm:
             self.submodules += [self.q_norm, self.k_norm]

-        if cfg.query_pre_attn_scalar:
+        if cfg.attention_multiplier:
+            self.scaling = cfg.attention_multiplier
+        elif cfg.query_pre_attn_scalar:
             self.scaling = cfg.query_pre_attn_scalar ** (-0.5)
         else:
             self.scaling = 1 / math.sqrt(self.head_dim)
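The new precedence is: an explicit attention_multiplier wins, then query_pre_attn_scalar (applied as an inverse square root), then the usual 1/sqrt(head_dim). A minimal sketch of that selection in isolation (field names as read in exllamav2/config.py; the numeric values are illustrative only):

import math

def attn_scale(attention_multiplier, query_pre_attn_scalar, head_dim):
    if attention_multiplier:                      # e.g. Granite-style configs
        return attention_multiplier
    elif query_pre_attn_scalar:                   # e.g. Gemma-2-style configs
        return query_pre_attn_scalar ** (-0.5)
    else:                                         # default softmax scaling
        return 1 / math.sqrt(head_dim)

print(attn_scale(0.0078125, None, 128))   # explicit multiplier used as-is
print(attn_scale(None, 256, 128))         # 256 ** -0.5 = 0.0625
print(attn_scale(None, None, 128))        # 1 / sqrt(128)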

exllamav2/config.py

+17-6
@@ -115,6 +115,7 @@ class ExLlamaV2Config:
     final_logit_softcapping: float | None
     attn_logit_softcapping: float | None
     sliding_window: int
+    sliding_window_pattern: int
     norm_head: int | None
     l3_rope_factor: float | None
     l3_rope_low_freq_factor: float | None
@@ -125,6 +126,7 @@ class ExLlamaV2Config:
     checkpoint_fused_mlp: bool
     checkpoint_offset_qzeros: bool
     mrope_section: list | None
+    attention_multiplier: float | None

     vision_model_type: str | None
     vision_head_dim: int | None
@@ -288,6 +290,7 @@ def prepare(self, no_tensors: bool = False):
         self.use_qk_norm = read(read_config, bool, ["use_qk_norm"], False)

         self.query_pre_attn_scalar = read(read_config, float, "query_pre_attn_scalar", None)
+        self.attention_multiplier = read(read_config, float, "attention_multiplier", None)

         # MLP params

@@ -313,11 +316,17 @@ def prepare(self, no_tensors: bool = False):
         dim_model_base = read(read_config, int, "dim_model_base", self.hidden_size)
         self.logit_scale /= (self.hidden_size / dim_model_base)

-        self.scale_emb = read(read_config, float, "scale_emb", 1)
+        logit_scaling = read(read_config, float, "logits_scaling", None)  # Granite is backwards
+        if logit_scaling:
+            self.logit_scale = 1.0 / logit_scaling
+
+        self.scale_emb = read(read_config, float, ["scale_emb", "embedding_multiplier"], 1)
+        residual_multiplier = read(read_config, float, "residual_multiplier", None)
         scale_depth = read(read_config, float, "scale_depth", None)
-        if scale_depth is None:
-            self.scale_depth = 1
-        else:
+        self.scale_depth = 1
+        if residual_multiplier:
+            self.scale_depth = residual_multiplier
+        elif scale_depth:
             self.scale_depth = scale_depth / math.sqrt(self.num_hidden_layers)

         self.attn_logit_softcapping = read(read_config, float, "attn_logit_softcapping", None)
@@ -347,6 +356,7 @@ def prepare(self, no_tensors: bool = False):
         self.original_max_seq_len = self.max_seq_len

         self.sliding_window = read(read_config, int, ["sliding_window", "sliding_window_size"], 0, opt_subkey = "text_config")
+        self.sliding_window_pattern = read(read_config, int, ["sliding_window_pattern"], 1)

         rs = read(read_config, dict, "rope_scaling", None)
         if rs:
@@ -476,13 +486,14 @@ def check_keys(archparams, prefix):
         self.vision_num_attention_heads = read(read_config, int, ["vision_config->num_attention_heads"], no_default)
         self.vision_num_key_value_heads = read(read_config, int, ["vision_config->num_key_value_heads"], self.vision_num_attention_heads)
         self.vision_num_key_value_groups = self.vision_num_attention_heads // self.vision_num_key_value_heads
+        self.multimodal_projector_bias = read(read_config, bool, ["multimodal_projector_bias"], True)

         self.vision_hidden_act = read(read_config, str, ["vision_config->hidden_act"], no_default)
-        self.vision_hidden_size = read(read_config, int, ["vision_config->image_size"], no_default)
+        self.vision_hidden_size = read(read_config, int, ["vision_config->hidden_size"], 1024)
         patch_size = read(read_config, int, ["vision_config->patch_size"], no_default)
         self.vision_rope_theta = read(read_config, int, ["vision_config->rope_theta"], no_default)
         self.vision_feature_layer = read(read_config, int, ["vision_feature_layer"], no_default)
-        self.vision_num_layers = 24
+        self.vision_num_layers = read(read_config, int, ["vision_config->num_hidden_layers"], 24)
         self.vision_intermediate_size = read(read_config, int, ["vision_config->intermediate_size"], self.hidden_size)

         image_processor_type = read(read_prep_config, str, ["image_processor_type"], no_default)
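A minimal sketch of how the Granite-style keys read above fold into the existing scale fields, using an invented config dict (key names are from the hunk; the values are illustrative, not taken from any real checkpoint):

# Invented values, shaped like a Granite 3 style config.json fragment.
read_config = {
    "logits_scaling": 16.0,
    "embedding_multiplier": 12.0,
    "residual_multiplier": 0.22,
}

logit_scale = 1.0 / read_config["logits_scaling"]        # "Granite is backwards": the config stores a divisor
scale_emb = read_config.get("embedding_multiplier", 1)   # also accepted under the older "scale_emb" key
scale_depth = read_config.get("residual_multiplier", 1)  # takes precedence over "scale_depth"

print(logit_scale, scale_emb, scale_depth)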

exllamav2/device.py

+1-8
@@ -123,20 +123,13 @@ def prepare_sincos(self):
             self.cos = self.sin
             return

-        base = cfg.rotary_embedding_base
-        alpha = cfg.scale_alpha_value or 1.0
-        scale = cfg.scale_pos_emb or 1.0
-
-        # Alpha scaling for any rope_scaling type
-
-        if alpha != 1.0: base *= alpha ** (cfg.head_dim / (cfg.head_dim - 2))
-
         # RoPE params

         inv_freq, scaling_factor = rope.get_rope_params(device, cfg)

         # Common

+        scale = cfg.scale_pos_emb or 1.0
         t = torch.arange(cfg.max_seq_len, device = device, dtype = torch.float32)
         if scale != 1.0: t /= scale

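The alpha (NTK) adjustment removed from prepare_sincos previously rescaled the RoPE base in place. A minimal sketch of that formula, kept here for reference (the commit presumably moves this logic into rope.get_rope_params, which now returns inv_freq directly):

def ntk_scaled_base(base, alpha, head_dim):
    # NTK-style alpha scaling: base * alpha ** (d / (d - 2)), as in the removed lines
    if alpha and alpha != 1.0:
        base *= alpha ** (head_dim / (head_dim - 2))
    return base

print(ntk_scaled_base(10000.0, 2.0, 128))   # illustrative values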
