From e25773479884a83fe2b34886ebbdf458a1aab1d8 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 8 Oct 2024 17:02:47 +0800 Subject: [PATCH 1/5] update --- swift/llm/export.py | 6 ++--- swift/llm/infer.py | 53 ++++++++++++++++++++----------------- swift/llm/utils/template.py | 4 +++ 3 files changed, 35 insertions(+), 28 deletions(-) diff --git a/swift/llm/export.py b/swift/llm/export.py index 3b04f5f764..ca23479999 100644 --- a/swift/llm/export.py +++ b/swift/llm/export.py @@ -255,18 +255,18 @@ def llm_export(args: ExportArguments) -> None: if args.quant_method == 'awq': from awq import AutoAWQForCausalLM model, template = prepare_model_template( - args, device_map=args.quant_device_map, verbose=False, automodel_class=AutoAWQForCausalLM) + args, device_map=args.quant_device_map, export_mode=True, automodel_class=AutoAWQForCausalLM) awq_model_quantize(model, template.tokenizer, args.quant_batch_size) model.save_quantized(args.quant_output_dir) elif args.quant_method == 'gptq': - model, template = prepare_model_template(args, device_map=args.quant_device_map, verbose=False) + model, template = prepare_model_template(args, device_map=args.quant_device_map, export_mode=True) gptq_quantizer = gptq_model_quantize(model, template.tokenizer, args.quant_batch_size) model.config.quantization_config.pop('dataset', None) gptq_quantizer.save(model, args.quant_output_dir) elif args.quant_method == 'bnb': args.quantization_bit = args.quant_bits args.bnb_4bit_compute_dtype, args.load_in_4bit, args.load_in_8bit = args.select_bnb() - model, template = prepare_model_template(args, device_map=args.quant_device_map, verbose=False) + model, template = prepare_model_template(args, device_map=args.quant_device_map, export_mode=True) model.save_pretrained(args.quant_output_dir) else: raise ValueError(f'args.quant_method: {args.quant_method}') diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 3e1a1439ed..3b5d3aaf75 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -109,7 +109,7 @@ def merge_lora(args: InferArguments, if device_map is None: device_map = args.merge_device_map logger.info(f'merge_device_map: {device_map}') - model, template = prepare_model_template(args, device_map=device_map, verbose=False) + model, template = prepare_model_template(args, device_map=device_map, export_mode=True) logger.info('Merge LoRA...') Swift.merge_and_unload(model) model = model.model @@ -130,11 +130,12 @@ def merge_lora(args: InferArguments, return merged_lora_path -def prepare_model_template(args: InferArguments, - *, - device_map: Optional[str] = None, - verbose: bool = True, - automodel_class=None) -> Tuple[PreTrainedModel, Template]: +def prepare_model_template( + args: InferArguments, + *, + device_map: Optional[str] = None, + export_mode: bool = False, # for inference or export + automodel_class=None) -> Tuple[PreTrainedModel, Template]: from .sft import get_default_device_map if is_torch_npu_available(): print(f'device_count: {torch.npu.device_count()}') @@ -188,25 +189,7 @@ def prepare_model_template(args: InferArguments, revision=args.model_revision, quant_method=args.quant_method, **kwargs) - if verbose: - logger.info(f'model_config: {model.config}') - - generation_config = GenerationConfig( - max_new_tokens=args.max_new_tokens, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - do_sample=args.do_sample, - repetition_penalty=args.repetition_penalty, - num_beams=args.num_beams, - pad_token_id=tokenizer.pad_token_id, - eos_token_id=tokenizer.eos_token_id) - 
set_generation_config(model, generation_config) - logger.info(f'model.generation_config: {model.generation_config}') - if model.generation_config.num_beams != 1: - args.stream = False - logger.info('Setting args.stream: False') if model.max_model_len is None: model.max_model_len = args.max_model_len elif args.max_model_len is not None: @@ -215,6 +198,26 @@ def prepare_model_template(args: InferArguments, else: raise ValueError('args.max_model_len exceeds the maximum max_model_len supported by the model.' f'args.max_model_len: {args.max_model_len}, model.max_model_len: {model.max_model_len}') + if not export_mode: + logger.info(f'model_config: {model.config}') + generation_config = GenerationConfig( + max_new_tokens=args.max_new_tokens, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + do_sample=args.do_sample, + repetition_penalty=args.repetition_penalty, + num_beams=args.num_beams, + pad_token_id=tokenizer.pad_token_id, + eos_token_id=tokenizer.eos_token_id) + model._generation_config_origin = model.generation_config + set_generation_config(model, generation_config) + logger.info(f'model.generation_config: {model.generation_config}') + + if model.generation_config.num_beams != 1: + args.stream = False + logger.info('Setting args.stream: False') + # Preparing LoRA if is_adapter(args.sft_type) and args.ckpt_dir is not None: if isinstance(args, DeployArguments) and args.lora_request_list is not None: @@ -227,7 +230,7 @@ def prepare_model_template(args: InferArguments, model = model.to(model.dtype) model.requires_grad_(False) - if verbose: + if not export_mode: show_layers(model) logger.info(model) logger.info(get_model_info(model)) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index ac3b95611e..96c4209db4 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -2028,6 +2028,10 @@ def _post_encode(self, model, data: Any) -> Dict[str, Any]: res['labels'] = labels[0] return res + @staticmethod + def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]: + return generate_ids + register_template(TemplateType.llama3_1_omni, Llama3_1OmniTemplate(), lazy_tokenize=True) From 86fe49fce731a4dc1c6819120211659a1802e484 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 8 Oct 2024 21:06:45 +0800 Subject: [PATCH 2/5] update --- swift/llm/deploy.py | 5 +---- swift/llm/utils/vllm_utils.py | 26 -------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/swift/llm/deploy.py b/swift/llm/deploy.py index 24f322c347..a905bab5ee 100644 --- a/swift/llm/deploy.py +++ b/swift/llm/deploy.py @@ -275,7 +275,7 @@ async def inference_vllm_async(request: Union[ChatCompletionRequest, CompletionR request_id = request_info['request_id'] kwargs = {'max_tokens': request.max_tokens} - for key in ['n', 'best_of', 'frequency_penalty', 'length_penalty', 'presence_penalty', 'num_beams']: + for key in ['n', 'best_of', 'frequency_penalty', 'length_penalty', 'presence_penalty']: kwargs[key] = getattr(request, key) for key in ['temperature', 'top_k', 'top_p', 'repetition_penalty']: new_value = getattr(request, key) @@ -292,9 +292,6 @@ async def inference_vllm_async(request: Union[ChatCompletionRequest, CompletionR kwargs['logprobs'] = max(1, request.top_logprobs) generation_config = VllmGenerationConfig(**kwargs) - if generation_config.use_beam_search and request.stream: - error_msg = 'Streaming generation does not support beam search.' 
- raise ValueError(error_msg) tokenizer = template.tokenizer if tokenizer.eos_token is not None and tokenizer.eos_token not in generation_config.stop: generation_config.stop.append(tokenizer.eos_token) diff --git a/swift/llm/utils/vllm_utils.py b/swift/llm/utils/vllm_utils.py index a7e36e870d..8235fdcf36 100644 --- a/swift/llm/utils/vllm_utils.py +++ b/swift/llm/utils/vllm_utils.py @@ -204,7 +204,6 @@ def __init__( top_k: int = 50, # -1: all top_p: float = 1., repetition_penalty: float = 1., - num_beams: int = 1, *, n: int = 1, logprobs: Optional[int] = None, @@ -218,12 +217,6 @@ def __init__( max_new_tokens = kwargs.pop('max_new_tokens', None) if max_new_tokens is not None: max_tokens = max_new_tokens - if num_beams > 1: - top_k = -1 - top_p = 1 - temperature = 0 - logger.warning('The output of num_beams in vllm may not be consistent with ' - 'the output of num_beams in transformers.') if top_k == 0: top_k = -1 if stop is None: @@ -233,11 +226,6 @@ def __init__( kwargs['top_k'] = top_k kwargs['top_p'] = top_p kwargs['repetition_penalty'] = repetition_penalty - if num_beams > 1: - best_of = kwargs.get('best_of') - assert 'use_beam_search' not in kwargs and best_of is None - kwargs['use_beam_search'] = True - kwargs['best_of'] = num_beams kwargs['n'] = n kwargs['logprobs'] = logprobs kwargs['seed'] = seed @@ -260,7 +248,6 @@ class VllmGenerationConfig(_VllmGenerationConfigMixin, SamplingParams): top_k: int = 50 # -1: all top_p: float = 1. repetition_penalty: float = 1. - num_beams: int = 1 n: int = 1 logprobs: Optional[int] = None seed: Optional[int] = None @@ -269,15 +256,6 @@ class VllmGenerationConfig(_VllmGenerationConfigMixin, SamplingParams): skip_special_tokens: bool = False def __post_init__(self): - if self.num_beams > 1: - self.top_k = -1 - self.top_p = 1 - self.temperature = 0 - logger.warning('The output of num_beams in vllm may not be consistent with ' - 'the output of num_beams in transformers.') - assert self.best_of is None - self.use_beam_search = True - self.best_of = self.num_beams if self.top_k == 0: self.top_k = -1 if self.stop is None: @@ -435,10 +413,6 @@ def inference_stream_vllm( use_tqdm=use_tqdm, **kwargs) - if generation_config.use_beam_search: - error_msg = 'Streaming generation does not support beam search.' 
- raise ValueError(error_msg) - n_finished = 0 n_steps = 0 if flush_steps is None: From b6b427ac7e1aa55988ffc3d9d026416c813f0151 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 8 Oct 2024 21:21:21 +0800 Subject: [PATCH 3/5] update --- swift/llm/export.py | 6 +++--- swift/llm/infer.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/swift/llm/export.py b/swift/llm/export.py index ca23479999..0f85a7c5e7 100644 --- a/swift/llm/export.py +++ b/swift/llm/export.py @@ -255,18 +255,18 @@ def llm_export(args: ExportArguments) -> None: if args.quant_method == 'awq': from awq import AutoAWQForCausalLM model, template = prepare_model_template( - args, device_map=args.quant_device_map, export_mode=True, automodel_class=AutoAWQForCausalLM) + args, device_map=args.quant_device_map, task='export', automodel_class=AutoAWQForCausalLM) awq_model_quantize(model, template.tokenizer, args.quant_batch_size) model.save_quantized(args.quant_output_dir) elif args.quant_method == 'gptq': - model, template = prepare_model_template(args, device_map=args.quant_device_map, export_mode=True) + model, template = prepare_model_template(args, device_map=args.quant_device_map, task='export') gptq_quantizer = gptq_model_quantize(model, template.tokenizer, args.quant_batch_size) model.config.quantization_config.pop('dataset', None) gptq_quantizer.save(model, args.quant_output_dir) elif args.quant_method == 'bnb': args.quantization_bit = args.quant_bits args.bnb_4bit_compute_dtype, args.load_in_4bit, args.load_in_8bit = args.select_bnb() - model, template = prepare_model_template(args, device_map=args.quant_device_map, export_mode=True) + model, template = prepare_model_template(args, device_map=args.quant_device_map, task='export') model.save_pretrained(args.quant_output_dir) else: raise ValueError(f'args.quant_method: {args.quant_method}') diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 3b5d3aaf75..1a5f51cfcd 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -109,7 +109,7 @@ def merge_lora(args: InferArguments, if device_map is None: device_map = args.merge_device_map logger.info(f'merge_device_map: {device_map}') - model, template = prepare_model_template(args, device_map=device_map, export_mode=True) + model, template = prepare_model_template(args, device_map=device_map, task='export') logger.info('Merge LoRA...') Swift.merge_and_unload(model) model = model.model @@ -134,7 +134,7 @@ def prepare_model_template( args: InferArguments, *, device_map: Optional[str] = None, - export_mode: bool = False, # for inference or export + task: Literal['infer', 'export'] = 'infer', # for inference or export automodel_class=None) -> Tuple[PreTrainedModel, Template]: from .sft import get_default_device_map if is_torch_npu_available(): @@ -198,7 +198,7 @@ def prepare_model_template( else: raise ValueError('args.max_model_len exceeds the maximum max_model_len supported by the model.' 
f'args.max_model_len: {args.max_model_len}, model.max_model_len: {model.max_model_len}') - if not export_mode: + if task == 'infer': logger.info(f'model_config: {model.config}') generation_config = GenerationConfig( max_new_tokens=args.max_new_tokens, @@ -230,7 +230,7 @@ def prepare_model_template( model = model.to(model.dtype) model.requires_grad_(False) - if not export_mode: + if task == 'infer': show_layers(model) logger.info(model) logger.info(get_model_info(model)) From b7ca0b94742b44492076b46108074fc886d87076 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 8 Oct 2024 21:26:37 +0800 Subject: [PATCH 4/5] update --- swift/llm/infer.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/swift/llm/infer.py b/swift/llm/infer.py index 1a5f51cfcd..8181a2f47f 100644 --- a/swift/llm/infer.py +++ b/swift/llm/infer.py @@ -130,12 +130,11 @@ def merge_lora(args: InferArguments, return merged_lora_path -def prepare_model_template( - args: InferArguments, - *, - device_map: Optional[str] = None, - task: Literal['infer', 'export'] = 'infer', # for inference or export - automodel_class=None) -> Tuple[PreTrainedModel, Template]: +def prepare_model_template(args: InferArguments, + *, + device_map: Optional[str] = None, + task: Literal['infer', 'export'] = 'infer', + automodel_class=None) -> Tuple[PreTrainedModel, Template]: from .sft import get_default_device_map if is_torch_npu_available(): print(f'device_count: {torch.npu.device_count()}') From da376a13db3602f581b0d43d6995e3e1b85e3376 Mon Sep 17 00:00:00 2001 From: "huangjintao.hjt" Date: Tue, 8 Oct 2024 21:36:23 +0800 Subject: [PATCH 5/5] fix --- swift/llm/utils/template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 96c4209db4..84c01a2b61 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -2646,7 +2646,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An videos_path = example.get('videos') or [] if len(videos_path) > 0: video_processor = self.tokenizer.processor.video_processor - video_inputs = video_processor(videos, return_tensors='pt').to(self.model.dtype) + video_inputs = video_processor(videos_path, return_tensors='pt').to(self.model.dtype) inputs['pixel_values_videos'] = video_inputs['pixel_values_videos'] if len(images) > 0: image_processor = self.tokenizer.processor.image_processor
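
A hedged usage sketch (Python) follows; it is not part of the patches above. It mirrors the call sites changed in export.py and infer.py to show how the reworked prepare_model_template() now selects the export vs. inference path through the new task argument instead of the old verbose flag. The import paths and the model_type value are assumptions for illustration only.

# Hedged sketch of the two call paths of prepare_model_template() after this
# series.  Import locations follow the touched files (swift/llm/infer.py and
# the argument classes re-exported from swift.llm); values are placeholders.
from swift.llm import ExportArguments, InferArguments
from swift.llm.infer import prepare_model_template


def load_for_export(args: ExportArguments):
    # Export / quantization path (awq, gptq, bnb in export.py): task='export'
    # skips the inference-only work -- GenerationConfig creation, the
    # num_beams streaming check, and the verbose model/layer logging.
    return prepare_model_template(args, device_map=args.quant_device_map, task='export')


def load_for_infer(args: InferArguments):
    # Inference path: the default task='infer' keeps the original behaviour and,
    # after this series, also stores the untouched config on
    # model._generation_config_origin before set_generation_config() installs
    # the one built from the args.
    return prepare_model_template(args)


if __name__ == '__main__':
    # Illustrative invocation; the model_type value is a placeholder.
    model, template = load_for_infer(InferArguments(model_type='qwen2-7b-instruct'))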