diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md
index 8900ea9fd9b..f69dfd4f6cc 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/README.md
@@ -49,7 +49,7 @@ python run_clm_no_trainer.py \
 ```
 
 ### LLAMA2-7b/13b/70b
->Note: LLAMA requires IPEX requirements >= 2.1 to get better accuracy.
+>Note: LLAMA requires IPEX >= 2.1 to get better accuracy, and transformers < 4.48.0.
 #### Quantization
 
 ```bash
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
index 0bb87a2a5b2..190b531c7b7 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt
@@ -2,8 +2,8 @@ accelerate
 protobuf
 sentencepiece != 0.1.92
 datasets >= 1.1.3
-torch >= 1.10
-transformers < 4.48.0 # TODO: ILITV-3858
+torch == 2.7.0
+transformers
 pytest
 wandb
 einops
@@ -11,4 +11,4 @@ neural-compressor
 lm_eval <= 0.4.7
 peft
 optimum-intel
-intel_extension_for_pytorch
+intel_extension_for_pytorch == 2.7.0
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py
index 6d9d8947c79..d92c925448f 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py
@@ -30,6 +30,7 @@
     "--approach", type=str, default="static", help="Select from ['dynamic', 'static', 'weight-only']"
 )
 parser.add_argument("--optimized", action="store_true")
+parser.add_argument("--autotune", action="store_true", help="Use autotune to find the best alpha for SmoothQuant.")
 parser.add_argument("--ipex", action="store_true", help="Use intel extension for pytorch.")
 parser.add_argument("--load", action="store_true", help="Load quantized model.")
 parser.add_argument("--accuracy", action="store_true")
@@ -204,15 +205,25 @@ def eval_func(model):
 
         example_inputs = get_example_inputs(user_model, calib_dataloader)
 
-        from neural_compressor.torch.quantization import SmoothQuantConfig, autotune, TuningConfig
-        tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning())
-        user_model = autotune(
-            user_model,
-            tune_config=tune_config,
-            eval_fn=eval_func,
-            run_fn=run_fn,
-            example_inputs=example_inputs,
-        )
+        if args.autotune:
+            from neural_compressor.torch.quantization import SmoothQuantConfig, autotune, TuningConfig
+            tune_config = TuningConfig(config_set=SmoothQuantConfig.get_config_set_for_tuning())
+            user_model = autotune(
+                user_model,
+                tune_config=tune_config,
+                eval_fn=eval_func,
+                run_fn=run_fn,
+                example_inputs=example_inputs,
+            )
+        else:
+            from neural_compressor.torch.quantization import SmoothQuantConfig, prepare, convert
+            args.alpha = eval(args.alpha)
+            excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
+            quant_config = SmoothQuantConfig(alpha=args.alpha, folding=False, excluded_precisions=excluded_precisions)
+
+            user_model = prepare(model=user_model, quant_config=quant_config, example_inputs=example_inputs)
+            run_fn(user_model)
+            user_model = convert(user_model)
 
     user_model.save(args.output_dir)
 
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh
index 774bb73b6f1..cde7b0bc7f7 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_quant.sh
@@ -47,10 +47,10 @@ function run_tuning {
         extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
     elif [ "${topology}" = "llama2_7b_ipex_sq" ]; then
        model_name_or_path="meta-llama/Llama-2-7b-hf"
-        extra_cmd=$extra_cmd" --ipex --sq --alpha 0.8"
+        extra_cmd=$extra_cmd" --ipex --sq --alpha 0.65"
     elif [ "${topology}" = "gpt_j_ipex_sq" ]; then
        model_name_or_path="EleutherAI/gpt-j-6b"
-        extra_cmd=$extra_cmd" --ipex --sq --alpha 1.0"
+        extra_cmd=$extra_cmd" --ipex --sq --alpha 0.5"
     fi
 
     python -u run_clm_no_trainer.py \
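
For reference, below is a minimal sketch of the fixed-alpha prepare/calibrate/convert flow that this change adds as the non-`--autotune` path. It is a toy stand-in, not the example's actual pipeline: the tiny `nn.Sequential` model, random calibration batches, and `alpha=0.5` are placeholders for the real LLM, the pile-10k calibration dataloader, and the per-model alphas set in run_quant.sh. It assumes `neural-compressor` and `intel_extension_for_pytorch` are installed, since the SmoothQuant static path in this example targets the IPEX backend.

```python
# Hypothetical sketch of the fixed-alpha SmoothQuant path introduced above.
import torch
from neural_compressor.torch.quantization import SmoothQuantConfig, prepare, convert

# Toy model and example inputs; the real script uses an LLM plus
# get_example_inputs(user_model, calib_dataloader).
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 8))
example_inputs = (torch.randn(1, 8),)

def run_fn(m):
    # Calibration: feed a few representative batches through the prepared model.
    for _ in range(4):
        m(torch.randn(1, 8))

# Fixed alpha instead of an autotune search; folding=False and the bf16
# exclusion mirror the settings in the diff.
quant_config = SmoothQuantConfig(alpha=0.5, folding=False, excluded_precisions=["bf16"])
model = prepare(model=model, quant_config=quant_config, example_inputs=example_inputs)
run_fn(model)           # calibration pass over the prepared model
model = convert(model)  # materialize the quantized model
```

Note that run_quant.sh does not pass `--autotune`, so its topologies go through this fixed-alpha branch with the updated per-model values (0.65 for llama2_7b_ipex_sq, 0.5 for gpt_j_ipex_sq); passing `--autotune` instead runs the alpha search via `SmoothQuantConfig.get_config_set_for_tuning()`.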