
Commit dcdec44

Merge pull request axolotl-ai-cloud#306 from ethanhs/xgen
Add XGen info to README and example config
2 parents: 3ffb018 + 3881143

2 files changed: +91 −0 lines changed


README.md (+1)
@@ -24,6 +24,7 @@
 | mpt |||||||||
 | falcon |||||||||
 | gpt-j |||||||||
+| XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ❓ | ✅ |


 ## Quickstart ⚡
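The example config added below loads Salesforce/xgen-7b-8k-base in 4-bit with trust_remote_code: true and reuses "<|endoftext|>" for every special token. As a rough standalone sketch of that load path using plain transformers + bitsandbytes rather than axolotl (the exact kwargs, the NF4/double-quantization choices, and device_map are assumptions, not taken from this commit):

```python
# Sketch: load the XGen base model the way the config below asks axolotl to,
# i.e. 4-bit quantization plus trust_remote_code. Assumes transformers,
# accelerate and bitsandbytes are installed; kwargs are illustrative.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Salesforce/xgen-7b-8k-base"

# XGen ships its tokenizer as custom code on the Hub, hence trust_remote_code
# (matches `trust_remote_code: true` in the YAML).
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Mirror the special_tokens block at the end of the config: every special
# token is "<|endoftext|>".
tokenizer.add_special_tokens({
    "eos_token": "<|endoftext|>",
    "bos_token": "<|endoftext|>",
    "unk_token": "<|endoftext|>",
    "pad_token": "<|endoftext|>",
})

# `load_in_4bit: true` corresponds to a bitsandbytes 4-bit load; NF4 + double
# quantization are the usual QLoRA settings (assumed, not read from axolotl).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # `bf16: true` in the YAML
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)
```

Since "<|endoftext|>" should already be in XGen's vocabulary, the add_special_tokens call most likely only assigns roles rather than growing the embedding matrix; if it did add new tokens, a resize_token_embeddings call would be needed.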

examples/xgen-7b/xgen-7b-8k-qlora.yml (+90)
@@ -0,0 +1,90 @@
# An example finetuning Salesforce's XGen-7b model with 8k context using QLoRA
# on Tim Dettmers' Guanaco dataset.
base_model: Salesforce/xgen-7b-8k-base
base_model_config: Salesforce/xgen-7b-8k-base
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
gptq: false
strict: false
push_dataset_to_hub:
datasets:
  - path: timdettmers/openassistant-guanaco
    data_files:
      - openassistant_best_replies_train.jsonl
    type: "completion"
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
# enable QLoRA
adapter: qlora
lora_model_dir:
sequence_len: 8192
max_packed_sequence_len:

# hyperparameters from QLoRA paper Appendix B.2
# "We find hyperparameters to be largely robust across datasets"
lora_r: 64
lora_alpha: 16
# 0.1 for models up to 13B
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model:
output_dir: ./qlora-out

# QLoRA paper Table 9
# - 16 for 7b & 13b
# - 32 for 33b, 64 for 65b
# Max size tested on A6000
# - 7b: 40
# - 40b: 4
# decrease if OOM, increase for max VRAM utilization
micro_batch_size: 1
gradient_accumulation_steps: 1
num_epochs: 3
# Optimizer for QLoRA
optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
# QLoRA paper Table 9
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 65b
learning_rate: 0.00002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 10
eval_steps: 50
save_steps: 50
debug:
deepspeed:
weight_decay: 0.0
special_tokens:
  eos_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
  unk_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"
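For reference, the QLoRA block above (lora_r: 64, lora_alpha: 16, lora_dropout: 0.05, lora_target_linear: true) corresponds roughly to the following peft setup. This is a sketch only, not axolotl's actual code path; in particular the target_modules list is a hypothetical stand-in for "all linear layers of the base model", and XGen's real module names may differ.

```python
# Sketch of the adapter described by the QLoRA section of the config above.
# Re-uses the 4-bit `model` from the earlier loading sketch.
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Usual companion step for a 4-bit base model: freezes the base weights,
# upcasts norm layers and enables input grads so gradient checkpointing works
# (cf. `gradient_checkpointing: true` in the YAML).
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=64,               # lora_r
    lora_alpha=16,      # lora_alpha
    lora_dropout=0.05,  # lora_dropout
    bias="none",
    task_type="CAUSAL_LM",
    # Hypothetical stand-in for `lora_target_linear: true`; the actual linear
    # layer names in XGen may differ.
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out"],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights should be trainable
```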
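The early_stopping_patience: 3 setting above maps onto transformers' EarlyStoppingCallback (linked in the config comment). In plain transformers it would look roughly like this; a sketch only, where `model`, `train_ds` and `eval_ds` are placeholders and axolotl wires these arguments up from the YAML itself:

```python
# Sketch: how early_stopping_patience / eval_steps / save_steps map onto a
# plain transformers Trainer. `model` is the peft model from the sketch above;
# `train_ds` / `eval_ds` are hypothetical tokenized datasets.
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="./qlora-out",        # output_dir
    evaluation_strategy="steps",
    eval_steps=50,                   # eval_steps
    save_steps=50,                   # save_steps
    logging_steps=1,                 # logging_steps
    warmup_steps=10,                 # warmup_steps
    learning_rate=2e-5,              # learning_rate: 0.00002
    load_best_model_at_end=True,     # EarlyStoppingCallback tracks the best metric
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
```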

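Finally, the datasets entry in the config points type: "completion" at a raw-text JSONL file. A quick way to see what those records look like, using the datasets library; that each record exposes a single "text" field is an assumption about this dataset's schema, not something stated in the commit.

```python
# Peek at the Guanaco file referenced in the config's datasets section.
from datasets import load_dataset

ds = load_dataset(
    "timdettmers/openassistant-guanaco",
    data_files="openassistant_best_replies_train.jsonl",
    split="train",
)

print(ds)                   # row count and column names
print(ds[0]["text"][:200])  # assumed "text" field: raw text, no prompt/response split
```

With type: "completion", axolotl treats each record as plain text to continue rather than a structured prompt/response pair, which is consistent with no prompt template being configured elsewhere in the YAML.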