
Commit 240958e

Add the hetero training scripts for Aquila-34B and Aquila-70B
1 parent 16bf31b commit 240958e

5 files changed: +228 −93 lines

examples/aquila/hetero/pretrain_aquila_distributed_hetero_dp.sh renamed to examples/aquila/34B/pretrain_aquila_34b_distributed_A800_16n_80g_A100_48n_40g_hetero_dp.sh

Lines changed: 41 additions & 43 deletions
@@ -5,7 +5,6 @@ EXPNAME=$2
 HOSTFILE=$3
 DATA_PATH=$4
 
-
 # Prepare the environment related configuration
 source examples/aquila/env.sh
 
@@ -18,7 +17,6 @@ SPECIAL_TOKENS_FILE=examples/aquila/tokenizer/special_tokens.txt
 CHECKPOINT_PATH=$PROJ_HOME/checkpoints/$EXPNAME
 mkdir -p $CHECKPOINT_PATH
 LOG_PATH=$PROJ_HOME/logs/$EXPNAME
-rm -rf $LOG_PATH
 mkdir -p $LOG_PATH
 cp $0 $LOG_PATH/
 TB_PATH=$PROJ_HOME/tboard/$EXPNAME
@@ -27,42 +25,40 @@ WB_PATH=$PROJ_HOME/wandb/$EXPNAME
 mkdir -p $WB_PATH
 
 DISTRIBUTED_ARGS="
+    --nproc_per_node $NODE_DEVICES \
     --nnodes $NUM_NODES \
     --node_rank $NODE_RANK \
-    --nproc_per_node $NODE_DEVICES \
     --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT \
+    --master_port $MASTER_PORT
 "
-# --log_dir $LOG_PATH --redirects 3 --tee 3
-
-# DISTRIBUTED_ARGS="
-# --nnodes $NUM_NODES \
-# --rdzv_id "hetero" \
-# --nproc_per_node $NODE_DEVICES \
-# --rdzv-backend=c10d \
-# --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT
-# "
 
 HETERO_ARGS="
     --hetero-mode dp \
-    --hetero-device-types A800 A100 \
     --hetero-current-device-type $NODE_TYPE \
-    --hetero-micro-batch-sizes 2 3 2 1 \
+    --hetero-device-types A800 A100 \
+    --hetero-micro-batch-sizes 8 2 16 1 \
 "
 
 TRAINING_ARGS="
-    --train-samples 40000 \
+    --train-samples 488281250 \
+    --rampup-batch-size 32 32 2000000 \
     --eval-iters 0 \
-    --tensor-model-parallel-size 2 \
-    --pipeline-model-parallel-size 2 \
-    --global-batch-size 32 \
-    --disable-bias-linear
+    --eval-interval 2000 \
+    --tensor-model-parallel-size 4 \
+    --pipeline-model-parallel-size 4 \
+    --make-vocab-size-divisible-by 64 \
+    --global-batch-size 1024 \
+    --disable-bias-linear \
+    --use-flash-attn \
+    --sequence-parallel \
+    --use-distributed-optimizer
 "
 
 MIXED_PRECISION_ARGS="
     --bf16 \
-    --embedding-weights-in-fp32 \
     --attention-softmax-in-fp32 \
+    --embedding-weights-in-fp32 \
+    --rotary-position-embeddings-in-fp32 \
     --accumulate-allreduce-grads-in-fp32
 "
 
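How these numbers fit together (a hedged reading; the pairing rule for --hetero-micro-batch-sizes is inferred, not documented in this diff): with --hetero-device-types A800 A100, the four values 8 2 16 1 pair up per device type, and under either pairing order the per-step sum is 8*2 + 16*1 = 32 samples, which matches the starting batch of --rampup-batch-size 32 32 2000000. A minimal sketch of the arithmetic, assuming Megatron-style rampup semantics (start, increment, ramp-up samples):

    # Sketch only; assumes Megatron-style --rampup-batch-size START INCR RAMP_SAMPLES.
    START=32; INCR=32; RAMP_SAMPLES=2000000; TARGET=1024
    echo $(( 8*2 + 16*1 ))                       # 32: hetero per-step sum, equals START
    STEPS=$(( (TARGET - START) / INCR ))         # 31 batch-size increments to reach 1024
    echo "hold each size for ~$(( RAMP_SAMPLES / STEPS )) samples"   # ~64516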
@@ -78,24 +74,27 @@ DATA_ARGS="
 "
 
 NETWORK_ARGS="
-    --num-layers 8 \
-    --hidden-size 4096 \
-    --num-attention-heads 32 \
-    --seq-length 2048 \
-    --max-position-embeddings 2048 \
+    --num-layers 60 \
+    --hidden-size 6144 \
+    --num-attention-heads 48 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --hidden-dim-multiplier 1.3 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
     --layernorm-epsilon 1e-5 \
+    --layernorm-init-weight 0.3 \
     --use-rotary-position-embeddings \
-    --rotary-position-embeddings-in-fp32 \
     --no-position-embedding \
     --swiglu \
-    --multiple-of 256 \
+    --multiple-of 4096 \
     --apply-layernorm-rms \
     --untie-embeddings-and-output-weights
 "
 
 INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234
+    --init-method-std 0.0165 \
+    --seed 42
 "
 
 REGULARIZATION_ARGS="
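On the new network shape: --hidden-dim-multiplier 1.3 together with --swiglu and --multiple-of 4096 suggests a LLaMA-style FFN sizing rule (8/3 of the hidden size, scaled by the multiplier and rounded up to a multiple of --multiple-of). The rule itself is an assumption, not shown in this commit; under it the FFN width works out to 24576:

    # Assumed LLaMA-style SwiGLU sizing; the exact rule is not confirmed by this commit.
    awk 'BEGIN {
        hidden = 6144; multiplier = 1.3; multiple_of = 4096
        raw = (8 / 3) * hidden * multiplier                      # 21299.2
        ffn = multiple_of * int((raw + multiple_of - 1) / multiple_of)
        print ffn                                                # 24576
    }'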
@@ -104,32 +103,31 @@ REGULARIZATION_ARGS="
     --weight-decay 0.1 \
     --adam-beta1 0.9 \
     --adam-beta2 0.95 \
-    --clip-grad 0.0
+    --clip-grad 1.0
 "
 
 LEARNING_RATE_ARGS="
-    --lr 2.0e-3 \
+    --lr 1.5e-4 \
+    --lr-decay-style cosine \
+    --lr-warmup-samples 500000 \
+    --min-lr 1.5e-5
 "
-# --min-lr 2.0e-6 \
-# --lr-decay-style cosine \
-# --lr-warmup-samples 1000
 
 CHECKPOINTING_ARGS="
+    --save-interval 1000 \
+    --rampup-save-interval 5000 \
+    --save $CHECKPOINT_PATH \
     --load $CHECKPOINT_PATH
 "
-# --save-interval 200000 \
-# --save $CHECKPOINT_PATH \
 
 LOGGING_ARGS="
     --log-interval 1 \
+    --tensorboard-dir $TB_PATH \
+    --tensorboard-log-interval 1 \
+    --wandb-dir $WB_PATH
 "
-# --wandb-dir $WB_PATH \
-# --tensorboard-dir $TB_PATH \
-# --tensorboard-log-interval 1
-
-ENV_ARGS=""
 
-cmd="$ENV_ARGS torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+cmd="torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $HETERO_ARGS \
     $TRAINING_ARGS \
     $MIXED_PRECISION_ARGS \
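Two quantities follow directly from the new flags: the sample budget equals exactly 2T tokens at this sequence length, and the warmup covers roughly 488 optimizer steps once the batch size reaches 1024 (more iterations in practice while the batch is still ramping):

    SEQ_LEN=4096; TRAIN_SAMPLES=488281250
    WARMUP_SAMPLES=500000; GLOBAL_BATCH=1024
    echo $(( TRAIN_SAMPLES * SEQ_LEN ))          # 2000000000000 tokens (2T)
    echo $(( WARMUP_SAMPLES / GLOBAL_BATCH ))    # 488 warmup steps at full batch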
Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
+#!/bin/bash
+
+PROJ_HOME=$1
+EXPNAME=$2
+HOSTFILE=$3
+DATA_PATH=$4
+
+# Prepare the environment related configuration
+source examples/aquila/env.sh
+
+# Define files related to tokenizer
+VOCAB_FILE=examples/aquila/tokenizer/vocab.json
+MERGE_FILE=examples/aquila/tokenizer/merges.txt
+SPECIAL_TOKENS_FILE=examples/aquila/tokenizer/special_tokens.txt
+
+# Build some paths for the current training
+CHECKPOINT_PATH=$PROJ_HOME/checkpoints/$EXPNAME
+mkdir -p $CHECKPOINT_PATH
+LOG_PATH=$PROJ_HOME/logs/$EXPNAME
+mkdir -p $LOG_PATH
+cp $0 $LOG_PATH/
+TB_PATH=$PROJ_HOME/tboard/$EXPNAME
+mkdir -p $TB_PATH
+WB_PATH=$PROJ_HOME/wandb/$EXPNAME
+mkdir -p $WB_PATH
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NODE_DEVICES \
+    --nnodes $NUM_NODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+HETERO_ARGS="
+    --hetero-mode pp \
+    --hetero-current-device-type $NODE_TYPE \
+    --hetero-device-types A800 A100 \
+    --hetero-pipeline-stages 1 15 3 15 15 15 \
+"
+
+TRAINING_ARGS="
+    --train-samples 488281250 \
+    --rampup-batch-size 32 32 2000000 \
+    --eval-iters 0 \
+    --eval-interval 2000 \
+    --tensor-model-parallel-size 4 \
+    --pipeline-model-parallel-size 4 \
+    --make-vocab-size-divisible-by 64 \
+    --micro-batch-size 1 \
+    --global-batch-size 1024 \
+    --disable-bias-linear \
+    --recompute-granularity 'full' \
+    --recompute-method 'uniform' \
+    --sequence-parallel \
+    --use-distributed-optimizer
+"
+
+MIXED_PRECISION_ARGS="
+    --bf16 \
+    --attention-softmax-in-fp32 \
+    --embedding-weights-in-fp32 \
+    --rotary-position-embeddings-in-fp32 \
+    --accumulate-allreduce-grads-in-fp32
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --tokenizer-type AquilaTokenizer \
+    --vocab-file $VOCAB_FILE \
+    --vocab-size 100008 \
+    --merge-file $MERGE_FILE \
+    --special-tokens-file $SPECIAL_TOKENS_FILE \
+    --data-impl mmap \
+    --split 1
+"
+
+NETWORK_ARGS="
+    --num-layers 60 \
+    --hidden-size 6144 \
+    --num-attention-heads 48 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --hidden-dim-multiplier 1.3 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --layernorm-epsilon 1e-5 \
+    --layernorm-init-weight 0.3 \
+    --use-rotary-position-embeddings \
+    --no-position-embedding \
+    --swiglu \
+    --multiple-of 4096 \
+    --apply-layernorm-rms \
+    --untie-embeddings-and-output-weights
+"
+
+INITIALIZATION_ARGS="
+    --init-method-std 0.0165 \
+    --seed 42
+"
+
+REGULARIZATION_ARGS="
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --weight-decay 0.1 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --clip-grad 1.0
+"
+
+LEARNING_RATE_ARGS="
+    --lr 1.5e-4 \
+    --lr-decay-style cosine \
+    --lr-warmup-samples 500000 \
+    --min-lr 1.5e-5
+"
+
+CHECKPOINTING_ARGS="
+    --save-interval 1000 \
+    --rampup-save-interval 5000 \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH
+"
+
+LOGGING_ARGS="
+    --log-interval 1 \
+    --tensorboard-dir $TB_PATH \
+    --tensorboard-log-interval 1 \
+    --wandb-dir $WB_PATH
+"
+
+cmd="torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $HETERO_ARGS \
+    $TRAINING_ARGS \
+    $MIXED_PRECISION_ARGS \
+    $DATA_ARGS \
+    $NETWORK_ARGS \
+    $INITIALIZATION_ARGS \
+    $REGULARIZATION_ARGS \
+    $LEARNING_RATE_ARGS \
+    $CHECKPOINTING_ARGS \
+    $LOGGING_ARGS
+"
+echo $cmd
+eval $cmd
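Two consistency checks on this script. Reading --hetero-pipeline-stages 1 15 3 15 15 15 as per-device-type groups of (stage count, layers per stage) is an inference, but it lines up with the rest of the config: the A800 pool hosts 1 stage of 15 layers and the A100 pool 3 stages of 15 layers each, giving 4 pipeline stages and 60 layers in total. Separately, stock Megatron-LM pads the vocabulary to a multiple of make-vocab-size-divisible-by times the tensor-parallel size, which would take 100008 up to 100096 (this fork may differ):

    # Hedged reading of --hetero-pipeline-stages 1 15 3 15 15 15 (an assumption):
    # A800: 1 stage x 15 layers; A100: 3 stages x 15 layers each.
    echo $(( 1 + 3 ))                # 4,  matches --pipeline-model-parallel-size 4
    echo $(( 15 * 4 ))               # 60, matches --num-layers 60

    # Vocab padding as in stock Megatron-LM (may differ in this fork):
    # pad --vocab-size to a multiple of (make-vocab-size-divisible-by * TP).
    VOCAB=100008; MULTIPLE=$(( 64 * 4 ))
    echo $(( (VOCAB + MULTIPLE - 1) / MULTIPLE * MULTIPLE ))   # 100096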

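Both scripts read the same four positional arguments (PROJ_HOME, EXPNAME, HOSTFILE, DATA_PATH) and rely on examples/aquila/env.sh to set NODE_DEVICES, NUM_NODES, NODE_RANK, NODE_TYPE, MASTER_ADDR and MASTER_PORT. An illustrative invocation, with every path a hypothetical placeholder:

    # All paths below are hypothetical placeholders.
    # $1 PROJ_HOME  - checkpoints/, logs/, tboard/, wandb/ are created under it
    # $2 EXPNAME    - experiment name used for those subdirectories
    # $3 HOSTFILE   - presumably consumed by examples/aquila/env.sh
    # $4 DATA_PATH  - Megatron-style data prefix
    bash examples/aquila/34B/pretrain_aquila_34b_distributed_A800_16n_80g_A100_48n_40g_hetero_dp.sh \
        /data/aquila-proj aquila34b_hetero_dp \
        /data/aquila-proj/hostfile /data/aquila-proj/data/pile_text_document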
0 commit comments