Skip to content

Commit 5cd30d1

Browse files
authored
Ray fine-tuning example (#296)
1 parent 2a5d63c commit 5cd30d1

8 files changed

+1504
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
{
2+
"fp16": {
3+
"enabled": "auto"
4+
},
5+
"bf16": {
6+
"enabled": "auto"
7+
},
8+
"zero_optimization": {
9+
"stage": 3,
10+
"offload_optimizer": {
11+
"device": "cpu",
12+
"pin_memory": true
13+
},
14+
"offload_param": {
15+
"device": "cpu",
16+
"pin_memory": true
17+
},
18+
"overlap_comm": true,
19+
"contiguous_gradients": true,
20+
"sub_group_size": 1e9,
21+
"reduce_bucket_size": 5e8,
22+
"stage3_prefetch_bucket_size": 5e8,
23+
"stage3_param_persistence_threshold": 1e6,
24+
"stage3_max_live_parameters": 1e9,
25+
"stage3_max_reuse_distance": 1e9,
26+
"stage3_gather_16bit_weights_on_model_save": true,
27+
"round_robin_gradients": true
28+
},
29+
"gradient_accumulation_steps": "auto",
30+
"gradient_clipping": "auto",
31+
"steps_per_print": 10,
32+
"train_batch_size": "auto",
33+
"train_micro_batch_size_per_gpu": "auto",
34+
"wall_clock_breakdown": false
35+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"fp16": {
3+
"enabled": false
4+
},
5+
"bf16": {
6+
"enabled": true
7+
},
8+
"zero_optimization": {
9+
"stage": 3,
10+
"offload_optimizer": {
11+
"device": "cpu",
12+
"pin_memory": false
13+
},
14+
"overlap_comm": true,
15+
"contiguous_gradients": true,
16+
"reduce_bucket_size": "auto",
17+
"stage3_prefetch_bucket_size": "auto",
18+
"stage3_param_persistence_threshold": "auto",
19+
"gather_16bit_weights_on_model_save": true,
20+
"round_robin_gradients": true
21+
},
22+
"gradient_accumulation_steps": "auto",
23+
"gradient_clipping": "auto",
24+
"steps_per_print": 10,
25+
"train_batch_size": "auto",
26+
"train_micro_batch_size_per_gpu": "auto",
27+
"wall_clock_breakdown": false
28+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
{
2+
"fp16": {
3+
"enabled": "auto"
4+
},
5+
"bf16": {
6+
"enabled": "auto"
7+
},
8+
"zero_optimization": {
9+
"stage": 3,
10+
"offload_optimizer": {
11+
"device": "cpu",
12+
"pin_memory": true
13+
},
14+
"offload_param": {
15+
"device": "cpu",
16+
"pin_memory": true
17+
},
18+
"overlap_comm": true,
19+
"contiguous_gradients": true,
20+
"sub_group_size": 1e9,
21+
"reduce_bucket_size": 5e8,
22+
"stage3_prefetch_bucket_size": 5e8,
23+
"stage3_param_persistence_threshold": 1e6,
24+
"stage3_max_live_parameters": 1e9,
25+
"stage3_max_reuse_distance": 1e9,
26+
"stage3_gather_16bit_weights_on_model_save": true,
27+
"round_robin_gradients": true
28+
},
29+
"gradient_accumulation_steps": "auto",
30+
"gradient_clipping": "auto",
31+
"steps_per_print": 10,
32+
"train_batch_size": "auto",
33+
"train_micro_batch_size_per_gpu": "auto",
34+
"wall_clock_breakdown": false
35+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"fp16": {
3+
"enabled": "auto"
4+
},
5+
"bf16": {
6+
"enabled": "auto"
7+
},
8+
"zero_optimization": {
9+
"stage": 3,
10+
"offload_optimizer": {
11+
"device": "cpu",
12+
"pin_memory": false
13+
},
14+
"offload_param": {
15+
"device": "cpu",
16+
"pin_memory": false
17+
},
18+
"overlap_comm": true,
19+
"contiguous_gradients": true,
20+
"reduce_bucket_size": "auto",
21+
"stage3_prefetch_bucket_size": "auto",
22+
"stage3_param_persistence_threshold": "auto",
23+
"stage3_gather_16bit_weights_on_model_save": true,
24+
"round_robin_gradients": true
25+
},
26+
"gradient_accumulation_steps": "auto",
27+
"gradient_clipping": "auto",
28+
"steps_per_print": 10,
29+
"train_batch_size": "auto",
30+
"train_micro_batch_size_per_gpu": "auto",
31+
"wall_clock_breakdown": false
32+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"r": 8,
3+
"lora_alpha": 16,
4+
"lora_dropout": 0.05,
5+
"target_modules": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"],
6+
"task_type": "CAUSAL_LM",
7+
"modules_to_save": [],
8+
"bias": "none",
9+
"fan_in_fan_out": false,
10+
"init_lora_weights": true
11+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
pip:
2+
- transformers==4.44.0
3+
- accelerate==0.31.0
4+
- peft==0.11.1
5+
- deepspeed==0.16.2
6+
env_vars:
7+
LIBRARY_PATH: "${CUDA_HOME}/lib64:${LIBRARY_PATH}"
8+
PROJECT_DIR: "/home/yarnapp/hopsfs"
9+
TRAINED_MODEL_STORAGE_PATH: "${PROJECT_DIR}/Resources/llama_finetuning/fine-tuned-model"
10+
TRAINING_DATA_DIR: "${PROJECT_DIR}/Resources/llama_finetuning/datasets"
11+
TRAINING_CONFIGURATION_DIR: "${PROJECT_DIR}/Resources/llama_finetuning/configs"
12+

0 commit comments

Comments
 (0)