extra configs and debugging

EleutherAI · Jan 15, 2025 · a001467 · a001467
1 parent 945ae22
commit a001467
Show file tree

Hide file tree

Showing 50 changed files with 3,751 additions and 76 deletions.
diff --git a/configs/1-3B.yml → configs/1-3B-8k.yml b/configs/1-3B.yml → configs/1-3B-8k.yml
@@ -9,13 +9,18 @@
    "num_layers": 24,
    "hidden_size": 2048,
    "num_attention_heads": 16,
-   "seq_length": 2048,
-   "max_position_embeddings": 2048,
+   "seq_length": 8192,
+   "max_position_embeddings": 8192,
    "norm": "layernorm",
    "pos_emb": "rotary",
    "no_weight_tying": true,
    "gpt_j_residual": false,
    "output_layer_parallelism": "column",
+   "profile": true,
+   "profile_step_start": 1,
+   "profile_step_stop": 5,
+
+   "attention_config": [[["flash"], 24]],
 
    # these should provide some speedup but takes a while to build, set to true if desired
    "scaled_upper_triang_masked_softmax_fusion": false,
@@ -50,7 +55,7 @@
   },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "train_micro_batch_size_per_gpu": 1,
    "data_impl": "mmap",
 
    # activation checkpointing
@@ -76,8 +81,8 @@
    },
 
    # misc. training settings
-   "train_iters": 320000,
-   "lr_decay_iters": 320000,
+   "train_iters": 10,
+   "lr_decay_iters": 10,
    "distributed_backend": "nccl",
    "lr_decay_style": "cosine",
    "warmup": 0.01,
@@ -86,8 +91,8 @@
    "eval_iters": 10,
 
    # logging
-   "log_interval": 100,
-   "steps_per_print": 10,
+   "log_interval": 1,
+   "steps_per_print": 1,
    "keep_last_n_checkpoints": 4,
    "wall_clock_breakdown": true,
 }
diff --git a/configs/1-3B-weak-scaling.yml b/configs/1-3B-weak-scaling.yml
@@ -0,0 +1,97 @@
+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+
+   # model settings
+   "num_layers": 24,
+   "hidden_size": 2048,
+   "num_attention_heads": 16,
+   "seq_length": 4096,
+   "max_position_embeddings": 4096,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   #"profile": true,
+   #"profile_step_start": 1,
+   #"profile_step_stop": 5,
+
+   "attention_config": [[["flash"], 24]],
+   # these should provide some speedup but take a while to build; set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0002,
+       "betas": [0.9, 0.95],
+       "eps":  1.0e-8,
+     }
+   },
+   "min_lr": 0.00002,
+
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 1,
+   "data_impl": "mmap",
+
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+
+   # precision settings
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+
+   # misc. training settings
+   "train_iters": 100,
+   "lr_decay_iters": 100,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+
+   # logging
+   "log_interval": 5,
+   "steps_per_print": 5,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}
diff --git a/configs/10B-pp-scaling.yml b/configs/10B-pp-scaling.yml
@@ -0,0 +1,97 @@
+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 8,
+   "model_parallel_size": 1,
+
+   # model settings
+   "num_layers": 192,
+   "hidden_size": 2048,
+   "num_attention_heads": 16,
+   "seq_length": 4096,
+   "max_position_embeddings": 4096,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   #"profile": true,
+   #"profile_step_start": 1,
+   #"profile_step_stop": 5,
+
+   "attention_config": [[["flash"], 192]],
+   # these should provide some speedup but take a while to build; set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0002,
+       "betas": [0.9, 0.95],
+       "eps":  1.0e-8,
+     }
+   },
+   "min_lr": 0.00002,
+
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 1,
+   "data_impl": "mmap",
+
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+
+   # precision settings
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+
+   # misc. training settings
+   "train_iters": 100,
+   "lr_decay_iters": 100,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+
+   # logging
+   "log_interval": 5,
+   "steps_per_print": 5,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}
diff --git a/configs/13B.yml → configs/13B-weak-scaling.yml b/configs/13B.yml → configs/13B-weak-scaling.yml
@@ -3,19 +3,20 @@
    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
    # across the node boundaries )
    "pipe_parallel_size": 1,
-   "model_parallel_size": 1,
+   "model_parallel_size": 8,
 
    # model settings
    "num_layers": 40,
    "hidden_size": 5120,
    "num_attention_heads": 40,
-   "seq_length": 2048,
-   "max_position_embeddings": 2048,
+   "seq_length": 4096,
+   "max_position_embeddings": 4096,
    "norm": "layernorm",
    "pos_emb": "rotary",
    "no_weight_tying": true,
    "gpt_j_residual": false,
    "output_layer_parallelism": "column",
+   "attention_config": [[["flash"], 40]],
 
    # these should provide some speedup but takes a while to build, set to true if desired
    "scaled_upper_triang_masked_softmax_fusion": false,
@@ -51,7 +52,7 @@
    "min_lr": 0.00001,
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "train_micro_batch_size_per_gpu": 1,
    "data_impl": "mmap",
 
    # activation checkpointing
@@ -77,8 +78,8 @@
    },
 
    # misc. training settings
-   "train_iters": 320000,
-   "lr_decay_iters": 320000,
+   "train_iters": 100,
+   "lr_decay_iters": 100,
    "distributed_backend": "nccl",
    "lr_decay_style": "cosine",
    "warmup": 0.01,
@@ -87,8 +88,9 @@
    "eval_iters": 10,
 
    # logging
-   "log_interval": 100,
-   "steps_per_print": 10,
+   "log_interval": 5,
+   "steps_per_print": 5,
    "keep_last_n_checkpoints": 4,
    "wall_clock_breakdown": true,
+   "tensorboard_dir": "tensorboard-gpt",
 }
diff --git a/configs/2-7B.yml → configs/2-7B-weak-scaling.yml b/configs/2-7B.yml → configs/2-7B-weak-scaling.yml
@@ -3,7 +3,7 @@
    # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
    # across the node boundaries )
    "pipe_parallel_size": 1,
-   "model_parallel_size": 1,
+   "model_parallel_size": 2,
 
    # model settings
    "num_layers": 32,
@@ -16,6 +16,7 @@
    "no_weight_tying": true,
    "gpt_j_residual": false,
    "output_layer_parallelism": "column",
+   "attention_config": [[["flash"], 32]],
 
    # these should provide some speedup but takes a while to build, set to true if desired
    "scaled_upper_triang_masked_softmax_fusion": false,
@@ -50,7 +51,7 @@
   },
 
    # batch / data settings
-   "train_micro_batch_size_per_gpu": 4,
+   "train_micro_batch_size_per_gpu": 1,
    "data_impl": "mmap",
 
    # activation checkpointing
@@ -76,8 +77,8 @@
    },
 
    # misc. training settings
-   "train_iters": 320000,
-   "lr_decay_iters": 320000,
+   "train_iters": 100,
+   "lr_decay_iters": 100,
    "distributed_backend": "nccl",
    "lr_decay_style": "cosine",
    "warmup": 0.01,
@@ -86,8 +87,8 @@
    "eval_iters": 10,
 
    # logging
-   "log_interval": 100,
-   "steps_per_print": 10,
+   "log_interval": 5,
+   "steps_per_print": 5,
    "keep_last_n_checkpoints": 4,
    "wall_clock_breakdown": true,
 }