@@ -5,7 +5,6 @@ EXPNAME=$2
 HOSTFILE=$3
 DATA_PATH=$4

-
 # Prepare the environment-related configuration
 source examples/aquila/env.sh

@@ -18,7 +17,6 @@ SPECIAL_TOKENS_FILE=examples/aquila/tokenizer/special_tokens.txt
 CHECKPOINT_PATH=$PROJ_HOME/checkpoints/$EXPNAME
 mkdir -p $CHECKPOINT_PATH
 LOG_PATH=$PROJ_HOME/logs/$EXPNAME
-rm -rf $LOG_PATH
 mkdir -p $LOG_PATH
 cp $0 $LOG_PATH/
 TB_PATH=$PROJ_HOME/tboard/$EXPNAME
@@ -27,42 +25,40 @@ WB_PATH=$PROJ_HOME/wandb/$EXPNAME
 mkdir -p $WB_PATH

 DISTRIBUTED_ARGS="
+    --nproc_per_node $NODE_DEVICES \
     --nnodes $NUM_NODES \
     --node_rank $NODE_RANK \
-    --nproc_per_node $NODE_DEVICES \
     --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT \
+    --master_port $MASTER_PORT
 "
-# --log_dir $LOG_PATH --redirects 3 --tee 3
-
-# DISTRIBUTED_ARGS="
-#     --nnodes $NUM_NODES \
-#     --rdzv_id "hetero" \
-#     --nproc_per_node $NODE_DEVICES \
-#     --rdzv-backend=c10d \
-#     --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT
-# "

 HETERO_ARGS="
     --hetero-mode dp \
-    --hetero-device-types A800 A100 \
     --hetero-current-device-type $NODE_TYPE \
-    --hetero-micro-batch-sizes 2 3 2 1 \
+    --hetero-device-types A800 A100 \
+    --hetero-micro-batch-sizes 8 2 16 1 \
 "

 TRAINING_ARGS="
-    --train-samples 40000 \
+    --train-samples 488281250 \
+    --rampup-batch-size 32 32 2000000 \
     --eval-iters 0 \
-    --tensor-model-parallel-size 2 \
-    --pipeline-model-parallel-size 2 \
-    --global-batch-size 32 \
-    --disable-bias-linear
+    --eval-interval 2000 \
+    --tensor-model-parallel-size 4 \
+    --pipeline-model-parallel-size 4 \
+    --make-vocab-size-divisible-by 64 \
+    --global-batch-size 1024 \
+    --disable-bias-linear \
+    --use-flash-attn \
+    --sequence-parallel \
+    --use-distributed-optimizer
 "

 MIXED_PRECISION_ARGS="
     --bf16 \
-    --embedding-weights-in-fp32 \
     --attention-softmax-in-fp32 \
+    --embedding-weights-in-fp32 \
+    --rotary-position-embeddings-in-fp32 \
     --accumulate-allreduce-grads-in-fp32
 "

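Note: a back-of-envelope check of the heterogeneous batch math, assuming FlagScale reads --hetero-micro-batch-sizes as (data-parallel size, micro batch size) pairs per device type, i.e. 8 A800 replicas at micro-batch 2 plus 16 A100 replicas at micro-batch 1 (the variable names below are illustrative, not from the script):

    # hypothetical sanity check under the pairing assumption above
    samples_per_micro_step=$(( 8 * 2 + 16 * 1 ))        # 32 samples per accumulation step
    accum_steps=$(( 1024 / samples_per_micro_step ))    # 32 steps to fill --global-batch-size
    echo "$samples_per_micro_step samples/step, $accum_steps accumulation steps"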
@@ -78,24 +74,27 @@ DATA_ARGS="
 "

 NETWORK_ARGS="
-    --num-layers 8 \
-    --hidden-size 4096 \
-    --num-attention-heads 32 \
-    --seq-length 2048 \
-    --max-position-embeddings 2048 \
+    --num-layers 60 \
+    --hidden-size 6144 \
+    --num-attention-heads 48 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --hidden-dim-multiplier 1.3 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
     --layernorm-epsilon 1e-5 \
+    --layernorm-init-weight 0.3 \
     --use-rotary-position-embeddings \
-    --rotary-position-embeddings-in-fp32 \
     --no-position-embedding \
     --swiglu \
-    --multiple-of 256 \
+    --multiple-of 4096 \
     --apply-layernorm-rms \
     --untie-embeddings-and-output-weights
 "

 INITIALIZATION_ARGS="
-    --init-method-std 0.02 \
-    --seed 1234
+    --init-method-std 0.0165 \
+    --seed 42
 "

 REGULARIZATION_ARGS="
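Note: with --swiglu, the FFN width implied by --hidden-size 6144, --hidden-dim-multiplier 1.3, and --multiple-of 4096 comes out to 24576 under a LLaMA-style sizing rule (8/3 * hidden * multiplier, rounded up to a multiple of --multiple-of); that rule is an assumption about how FlagScale computes it, not something the diff states. The 48 attention heads with --num-query-groups 8 also mean 6 query heads share each KV head:

    # hypothetical check of the assumed SwiGLU FFN sizing (integer math: 8/3 * 1.3 = 104/30)
    h=6144; m=4096
    raw=$(( h * 8 * 13 / 30 ))          # 21299
    ffn=$(( (raw + m - 1) / m * m ))    # round up to a multiple of 4096 -> 24576
    echo "ffn_hidden_size = $ffn"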
@@ -104,32 +103,31 @@ REGULARIZATION_ARGS="
     --weight-decay 0.1 \
     --adam-beta1 0.9 \
     --adam-beta2 0.95 \
-    --clip-grad 0.0
+    --clip-grad 1.0
 "

 LEARNING_RATE_ARGS="
-    --lr 2.0e-3 \
+    --lr 1.5e-4 \
+    --lr-decay-style cosine \
+    --lr-warmup-samples 500000 \
+    --min-lr 1.5e-5
 "
-# --min-lr 2.0e-6 \
-# --lr-decay-style cosine \
-# --lr-warmup-samples 1000

 CHECKPOINTING_ARGS="
+    --save-interval 1000 \
+    --rampup-save-interval 5000 \
+    --save $CHECKPOINT_PATH \
     --load $CHECKPOINT_PATH
 "
-# --save-interval 200000 \
-# --save $CHECKPOINT_PATH \

 LOGGING_ARGS="
     --log-interval 1 \
+    --tensorboard-dir $TB_PATH \
+    --tensorboard-log-interval 1 \
+    --wandb-dir $WB_PATH
 "
-# --wandb-dir $WB_PATH \
-# --tensorboard-dir $TB_PATH \
-# --tensorboard-log-interval 1
-
-ENV_ARGS=" "

-cmd="$ENV_ARGS torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+cmd="torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
     $HETERO_ARGS \
     $TRAINING_ARGS \
     $MIXED_PRECISION_ARGS \
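Note: at the new --seq-length of 4096, --train-samples 488281250 is exactly a 2T-token budget (assuming every sample is one full-length sequence), and --lr-warmup-samples 500000 covers roughly 0.1% of it:

    # back-of-envelope schedule check
    echo $(( 488281250 * 4096 ))   # 2000000000000 tokens (2T)
    awk 'BEGIN { printf "warmup covers %.3f%% of training\n", 500000 / 488281250 * 100 }'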