# Copyright (c) Alibaba, Inc. and its affiliates.
import os
- from typing import List, Optional
+ from contextlib import contextmanager
+ from types import MethodType
+ from typing import Dict, List, Optional

import json
import torch
+ import torch.nn as nn

from swift.llm import get_model_tokenizer, get_template
from swift.utils import (check_json_format, get_logger, get_main, get_model_info, push_to_ms_hub, seed_everything,
                         show_layers)
from .infer import merge_lora, prepare_model_template, save_checkpoint
- from .utils import ExportArguments, Template, get_dataset, swift_to_peft_format
+ from .utils import ExportArguments, Template, deep_getattr, get_dataset, get_mllm_arch, swift_to_peft_format

logger = get_logger()

_args: Optional[ExportArguments] = None
template: Optional[Template] = None


+ def _prepare_dataset(examples: List[Dict[str, torch.LongTensor]], batch_size: int = 1, *args, **kwargs):
+     global _args, template
+     assert template is not None
+     examples = [
+         template.data_collator(examples[start:start + batch_size]) for start in range(0, len(examples), batch_size)
+     ]
+     return examples
+
+
def _get_dataset(*args, **kwargs):
    global _args, template
    assert _args is not None
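The added `_prepare_dataset` takes over optimum's batch preparation (it is patched in by `_patch_gptq` further down): it slices the already-encoded examples into chunks of `batch_size` and feeds each chunk to the template's data collator, so multimodal features stay grouped with their token ids. Below is a minimal sketch of that slicing pattern, separate from the patch itself; the toy padding collator is an assumption for illustration, not swift's `Template.data_collator`.

# --- sketch (not part of this patch): batch pre-encoded examples with a collator ---
import torch

def toy_collator(batch):
    # pads 'input_ids' to the longest example in the batch; a stand-in for Template.data_collator
    max_len = max(len(ex['input_ids']) for ex in batch)
    input_ids = torch.zeros(len(batch), max_len, dtype=torch.long)
    attention_mask = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, ex in enumerate(batch):
        ids = torch.tensor(ex['input_ids'])
        input_ids[i, :len(ids)] = ids
        attention_mask[i, :len(ids)] = 1
    return {'input_ids': input_ids, 'attention_mask': attention_mask}

examples = [{'input_ids': [1, 2, 3]}, {'input_ids': [4, 5]}, {'input_ids': [6]}]
batch_size = 2
batches = [toy_collator(examples[start:start + batch_size]) for start in range(0, len(examples), batch_size)]
print([b['input_ids'].shape for b in batches])  # [torch.Size([2, 3]), torch.Size([1, 1])]

Because the comprehension steps through range(0, len(examples), batch_size), a trailing partial batch is kept rather than dropped.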
@@ -39,27 +51,31 @@ def _get_dataset(*args, **kwargs):
    samples = []
    n_run = 0
    for data in dataset:
-         input_ids = template.encode(data)[0].get('input_ids')
+         inputs = template.encode(data)[0]
+         input_ids = inputs['input_ids']
        if input_ids is None or len(input_ids) == 0:
            continue
-         sample = torch.tensor(input_ids)
-         samples.append(sample)
+         if _args.is_multimodal and _args.quant_method == 'gptq':
+             inputs.pop('labels', None)
+             samples.append(inputs)
+         else:
+             samples += input_ids
        n_run += 1
        if n_run == n_samples:
            break
+     if _args.is_multimodal and _args.quant_method == 'gptq':
+         return samples
    # now concatenate all samples and split according to block size
-     cat_samples = torch.cat(samples, dim=0)  # shape: [X]
-     n_split = cat_samples.shape[0] // block_size
+     n_split = len(samples) // block_size
    logger.info(f'Split into {n_split} blocks')
-     if _args.quant_method == 'awq':
-         return [cat_samples[None, i * block_size:(i + 1) * block_size] for i in range(n_split)]
-     else:  # gptq
-         res = []
-         for i in range(n_split):
-             input_ids = cat_samples[None, i * block_size:(i + 1) * block_size]
-             attention_mask = torch.ones_like(input_ids)
-             res.append({'input_ids': input_ids, 'attention_mask': attention_mask})
-         return res
+     res = []
+     for i in range(n_split):
+         input_ids = samples[i * block_size:(i + 1) * block_size]
+         if _args.quant_method == 'awq':
+             res.append(torch.tensor(input_ids)[None])
+         else:
+             res.append({'input_ids': input_ids})
+     return res


def awq_model_quantize(awq_model, tokenizer, batch_size) -> None:
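In the rewritten `_get_dataset`, calibration token ids are accumulated as one flat Python list instead of per-sample tensors and then cut into `block_size` chunks: AWQ gets [1, block_size] tensors, GPTQ gets plain {'input_ids': ...} dicts, and the multimodal GPTQ path returns the encoded feature dicts earlier and never reaches this split. A small sketch of the splitting step follows, separate from the patch; `split_into_blocks` is a name introduced here purely for illustration.

# --- sketch (not part of this patch): split flat calibration ids into fixed-size blocks ---
import torch

def split_into_blocks(token_ids, block_size, quant_method):
    n_split = len(token_ids) // block_size          # leftover ids are dropped by the integer division
    res = []
    for i in range(n_split):
        block = token_ids[i * block_size:(i + 1) * block_size]
        if quant_method == 'awq':
            res.append(torch.tensor(block)[None])   # shape [1, block_size]
        else:  # gptq
            res.append({'input_ids': block})        # plain list of ids
    return res

ids = list(range(10))
print(split_into_blocks(ids, 4, 'awq'))   # two tensors of shape [1, 4]
print(split_into_blocks(ids, 4, 'gptq'))  # two dicts, each holding a 4-id list

Any tail shorter than a full block is silently discarded, matching the behaviour of the new code path.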
@@ -80,22 +96,74 @@ def awq_model_quantize(awq_model, tokenizer, batch_size) -> None:
        bits=_args.quant_bits, group_size=group_size, zero_point=True, version='GEMM')


+ @contextmanager
+ def _patch_gptq():
+     from optimum.gptq import quantizer
+     _get_dataset_origin = quantizer.get_dataset
+     _prepare_dataset_origin = quantizer.prepare_dataset
+     quantizer.get_dataset = _get_dataset
+     quantizer.prepare_dataset = _prepare_dataset
+     yield
+     quantizer.get_dataset = _get_dataset_origin
+     quantizer.prepare_dataset = _prepare_dataset_origin
+
+
+ def _patch_model_forward(module_list):
+
+     def _new_forward(self, *args, **kwargs):
+         if 'use_cache' in kwargs:
+             kwargs['use_cache'] = False
+         layer_ret = self.__old_forward(*args, **kwargs)
+         return layer_ret + args[len(layer_ret):]
+
+     for module in module_list:
+         if hasattr(module, '_old_forward'):  # device_map
+             __old_forward = module._old_forward
+             module._old_forward = MethodType(_new_forward, module)
+         else:
+             __old_forward = module.forward
+             module.forward = MethodType(_new_forward, module)
+         module.__old_forward = __old_forward
+
+
+ def get_block_name_to_quantize(model: nn.Module, model_type: str) -> Optional[str]:
+     mllm_arch = get_mllm_arch(model_type)
+     prefix = ''
+     if mllm_arch is not None:
+         assert len(mllm_arch.language_model) == 1, f'mllm_arch.language_model: {mllm_arch.language_model}'
+         prefix = mllm_arch.language_model[0]
+         model = deep_getattr(model, prefix)
+
+     module_lists = []
+     for n, m in model.named_modules():
+         if isinstance(m, nn.ModuleList) and len(m) >= 10:
+             module_lists.append((n, m))
+     if module_lists:
+         module_list = max(module_lists, key=lambda x: len(x[1]))
+         _patch_model_forward(module_list[1])
+         return f'{prefix}.{module_list[0]}'
+
+
def gptq_model_quantize(model, tokenizer, batch_size):
-     from optimum.gptq import GPTQQuantizer, quantizer
+     from optimum.gptq import GPTQQuantizer
    global _args
    logger.info(f'Quantization dataset: {_args.dataset}')
-     gptq_quantizer = GPTQQuantizer(bits=_args.quant_bits, dataset=','.join(_args.dataset), batch_size=batch_size)
-     _origin_get_dataset = quantizer.get_dataset
-     quantizer.get_dataset = _get_dataset
-     logger.info('Start quantizing the model...')
-     logger.warning('The process of packing the model takes a long time and there is no progress bar. '
-                    'Please be patient and wait...')
-     gptq_quantizer.quantize_model(model, tokenizer)
-     quantizer.get_dataset = _origin_get_dataset  # recover
+     with _patch_gptq():
+         gptq_quantizer = GPTQQuantizer(
+             bits=_args.quant_bits,
+             dataset=','.join(_args.dataset),
+             batch_size=batch_size,
+             block_name_to_quantize=get_block_name_to_quantize(model, _args.model_type))
+         logger.info('Start quantizing the model...')
+         logger.warning('The process of packing the model takes a long time and there is no progress bar. '
+                        'Please be patient and wait...')
+         if not hasattr(model.config, 'use_cache'):
+             model.config.use_cache = None
+         gptq_quantizer.quantize_model(model, tokenizer)
    return gptq_quantizer


- def replace_and_concat(template: 'Template', template_list: List, placeholder: str, keyword: str):
+ def replace_and_concat(template: Template, template_list: List, placeholder: str, keyword: str):
    final_str = ''
    for t in template_list:
        if isinstance(t, str):
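The new `get_block_name_to_quantize` picks the transformer block stack heuristically: it walks `named_modules()`, keeps every `nn.ModuleList` with at least 10 children, and takes the longest one; for multimodal models it first descends into the language model via `deep_getattr` and prefixes the returned name, and `_patch_model_forward` then forces `use_cache=False` on each block during calibration. A toy illustration of the heuristic, separate from the patch; the `ToyLM` model and its attribute names are invented for this sketch.

# --- sketch (not part of this patch): the "largest ModuleList" heuristic ---
import torch.nn as nn

class ToyLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(100, 16)
        self.layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(12)])  # stand-in decoder stack
        self.head = nn.Linear(16, 100)

model = ToyLM()
candidates = [(name, m) for name, m in model.named_modules()
              if isinstance(m, nn.ModuleList) and len(m) >= 10]
name, module_list = max(candidates, key=lambda x: len(x[1]))
print(name, len(module_list))  # 'layers' 12 -> the dotted name handed to GPTQQuantizer

On a real decoder-only checkpoint the same walk typically lands on a name like 'model.layers' or 'transformer.h', which is the kind of dotted module path optimum's block_name_to_quantize argument expects.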