# An example finetuning Salesforce's XGen-7b model with 8k context using QLoRA
# on Tim Dettmers' Guanaco dataset.
base_model: Salesforce/xgen-7b-8k-base
base_model_config: Salesforce/xgen-7b-8k-base
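# the XGen repo ships custom code on the Hub (notably its tokenizer), which is why remote code is trusted below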
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
gptq: false
strict: false
push_dataset_to_hub:
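# the Guanaco replies are plain text, so the "completion" type below trains on the raw text field without applying an instruction template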
datasets:
  - path: timdettmers/openassistant-guanaco
    data_files:
      - openassistant_best_replies_train.jsonl
    type: "completion"
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
# enable QLoRA
adapter: qlora
lora_model_dir:
sequence_len: 8192
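# max_packed_sequence_len is left unset below, which should keep samples from being packed together into one 8k sequence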
max_packed_sequence_len:

# hyperparameters from QLoRA paper Appendix B.2
# "We find hyperparameters to be largely robust across datasets"
lora_r: 64
lora_alpha: 16
# 0.1 for models up to 13B
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model:
output_dir: ./qlora-out

# QLoRA paper Table 9
# - 16 for 7b & 13b
# - 32 for 33b, 64 for 65b
# Max size tested on A6000
# - 7b: 40
# - 40b: 4
# decrease if OOM, increase for max VRAM utilization
micro_batch_size: 1
gradient_accumulation_steps: 1
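# effective batch size = micro_batch_size * gradient_accumulation_steps * number of GPUs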
num_epochs: 3
# Optimizer for QLoRA
optimizer: paged_adamw_32bit
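# paged AdamW (bitsandbytes) pages optimizer state through unified memory to ride out memory spikes, as recommended in the QLoRA paper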
torchdistx_path:
lr_scheduler: cosine
# QLoRA paper Table 9
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 65b
learning_rate: 0.00002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false
gradient_checkpointing: true
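# checkpointing recomputes activations in the backward pass, trading extra compute for lower VRAM use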
# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
local_rank:
logging_steps: 1
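# enable only one attention backend at a time; xformers memory-efficient attention is used here and flash_attention is left off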
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10
eval_steps: 50
save_steps: 50
debug:
deepspeed:
weight_decay: 0.0
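# XGen's tokenizer does not provide separate bos/unk/pad tokens, so all are mapped to its end-of-text token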
special_tokens:
  eos_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
  unk_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"
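# Example launch, assuming axolotl's usual entry point (adjust the path to wherever this config is saved):
#   accelerate launch scripts/finetune.py path/to/this-config.yml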