rebase main

yihuiwen · yihuiwen · commit e0fdcd7bd4c4 · 2025-04-02T21:47:48.000+08:00
diff --git a/lightllm/models/qwen2_5_vl/qwen2_5_visual.py b/lightllm/models/qwen2_5_vl/qwen2_5_visual.py
@@ -20,6 +20,7 @@
 from transformers import AutoProcessor
 from safetensors import safe_open
 from transformers.utils import TensorType
+from lightllm.server.multimodal_params import MultimodalParams, ImageItem
 from lightllm.models.qwen2_vl.qwen2_visual import PatchEmbed, VisionRotaryEmbedding
 
 # adapted from
@@ -509,17 +510,17 @@ def load_model(self, weight_dir):
 
         self.load_state_dict(weight_dict)
 
-    def encode(self, image_uuids: List):
+    def encode(self, images: List[ImageItem]):
         img_tensors = []
         valid_ids = []
         valid_id = 0
         img_grids = []
         uuids = []
 
-        for i, url in enumerate(image_uuids):
-            if isinstance(url, int):
-                uuids.append(url)
-                image_data = read_shm(get_shm_name_data(url))
+        for i, img in enumerate(images):
+            if isinstance(img, ImageItem):
+                uuids.append(img.uuid)
+                image_data = read_shm(get_shm_name_data(img.uuid))
                 image_data = Image.open(BytesIO(image_data))
                 image_data = get_image(image_data)
                 image_inputs = self.processor.preprocess(images=image_data, return_tensors="pt")
@@ -528,7 +529,7 @@ def encode(self, image_uuids: List):
                 img_tensors.append(pixel_values)
                 img_grids.append(image_grid_thw)
             else:
-                raise Exception("Unsupport input types: {} for {}".format(type(url), url))
+                raise Exception("Unsupport input types: {} for {}".format(type(img), img))
 
             # must divide by merge_length
             cur_num = img_tensors[-1].shape[0] // (self.spatial_merge_size ** 2)
diff --git a/lightllm/server/router/model_infer/mode_backend/base_backend.py b/lightllm/server/router/model_infer/mode_backend/base_backend.py
@@ -177,10 +177,7 @@ def init_model(self, kvargs):
                     self.model = Qwen2RewardTpPartModel(model_kvargs)
                 else:
                     self.model = Qwen2TpPartModel(model_kvargs)
-            elif self.model_type == "qwen2_vl":
-                self.model = Qwen2VLTpPartModel(model_kvargs)
-                self.is_multimodal = True
-            elif self.model_type == "qwen2_5_vl":
+            elif self.model_type in ["qwen2_vl", "qwen2_5_vl"]:
                 self.model = Qwen2VLTpPartModel(model_kvargs)
                 self.is_multimodal = True
             elif self.model_type == "gemma":
diff --git a/lightllm/server/tokenizer.py b/lightllm/server/tokenizer.py
@@ -79,12 +79,7 @@ def get_tokenizer(
         tokenizer = LlavaTokenizer(tokenizer, model_cfg)
     elif model_type == "qwen" and "visual" in model_cfg:
         tokenizer = QWenVLTokenizer(tokenizer, model_cfg)
-    elif model_type == "qwen2_vl" and "vision_config" in model_cfg:
-        from transformers import AutoProcessor
-
-        image_processor = AutoProcessor.from_pretrained(tokenizer_name)
-        tokenizer = QWen2VLTokenizer(tokenizer=tokenizer, image_processor=image_processor, model_cfg=model_cfg)
-    elif model_type == "qwen2_5_vl" and "vision_config" in model_cfg:
+    elif model_type in ["qwen2_vl", "qwen2_5_vl"] and "vision_config" in model_cfg:
         from transformers import AutoProcessor
 
         image_processor = AutoProcessor.from_pretrained(tokenizer_name)