Dmoe integration #1210

Open · wants to merge 18 commits into main from dmoe_integration
Commits (18)
52a2001
feat: ensure only the right architectures get built vs all of them fo…
DayOfThePenguin May 6, 2024
9c66895
feat: clean up megablocks-based DMoE implementation
DayOfThePenguin May 6, 2024
3388c51
feat: update args, configs, and requirements
DayOfThePenguin May 6, 2024
e987126
Update NeoXArgs docs automatically
invalid-email-address May 6, 2024
7d6f265
Merge branch 'main' of https://github.com/EleutherAI/gpt-neox into dm…
DayOfThePenguin May 21, 2024
4aebc2c
Merge branch 'dmoe_integration' of https://github.com/EleutherAI/gpt-…
DayOfThePenguin May 21, 2024
d9f8d55
Update NeoXArgs docs automatically
invalid-email-address May 21, 2024
f8c3776
feat: Update readme and example config
DayOfThePenguin May 21, 2024
3ef5b66
Merge branch 'dmoe_integration' of https://github.com/EleutherAI/gpt-…
DayOfThePenguin May 21, 2024
33e41d7
Update NeoXArgs docs automatically
invalid-email-address May 21, 2024
613aeb9
Merge branch 'main' of https://github.com/EleutherAI/gpt-neox into dm…
DayOfThePenguin May 22, 2024
35c7225
Update NeoXArgs docs automatically
invalid-email-address May 22, 2024
fb68c07
Merge branch 'main' into dmoe_integration
Quentin-Anthony Dec 3, 2024
542103f
- Updated sinkhorn initialization and add max_iter argument.
aurelion-source Dec 10, 2024
7b9679a
Merge branch 'main' into dmoe_integration
Quentin-Anthony Jan 29, 2025
aa7c4bc
- Updated dmoe config
aurelion-source Jan 31, 2025
2118a14
- Updated transformer_engine requirments
aurelion-source Feb 1, 2025
b51938d
- Disable torch.compile explicitly when using TE
aurelion-source Feb 1, 2025
73 changes: 7 additions & 66 deletions README.md
@@ -363,77 +363,18 @@ For a more detailed guide to the features available and how to configure them, s

## Mixture of Experts

GPT-NeoX includes multiple expert implementations for MoE. To select between them, specify `moe_type` of `megablocks` (default) or `deepspeed`.
GPT-NeoX includes support for Dropless Mixture of Experts (DMoE) through the `megablocks` library. It is compatible with both existing Megatron Tensor Parallelism and DeepSpeed Pipeline Parallel setups.

Both are based on the DeepSpeed MoE parallelism framework, which supports tensor-expert-data parallelism.
Both allow you to toggle between token-dropping and dropless routing (the default, and what Megablocks was designed for).
Sinkhorn routing to come soon!
This implementation leverages the existing Tensor Parallel Group to also shard the expert weights.
It uses Sinkhorn routing to avoid the need for a load balancing loss.
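
As a rough sketch of the idea only (this is not the code added in this PR), Sinkhorn routing repeatedly rescales the exponentiated router logits so that tokens spread evenly across experts; the `max_iter` knob mirrors the argument mentioned in commit 542103f:

```python
# Illustrative sketch, not the merged implementation: Sinkhorn normalization
# of router logits balances token-to-expert assignments without an auxiliary
# load-balancing loss term.
import torch

def sinkhorn(logits: torch.Tensor, max_iter: int = 8, eps: float = 1e-8) -> torch.Tensor:
    cost = torch.exp(logits)                            # [num_tokens, num_experts]
    d0 = torch.ones(cost.size(0), device=cost.device)   # per-token scaling
    d1 = torch.ones(cost.size(1), device=cost.device)   # per-expert scaling
    for _ in range(max_iter):
        d0 = 1.0 / (torch.sum(d1 * cost, dim=1) + eps)               # rebalance rows
        d1 = 1.0 / (torch.sum(d0.unsqueeze(1) * cost, dim=0) + eps)  # rebalance columns
    return d0.unsqueeze(1) * cost * d1.unsqueeze(0)      # balanced routing scores

# Each token is then routed to its top-k experts by these balanced scores.
```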

For an example of a basic complete configuration, see configs/125M-dmoe.yml (for Megablocks dropless) or configs/125M-moe.yml.
For an example of a basic complete configuration, see configs/125M-dmoe.yml.

Most MoE related configuration arguments are prefixed with `moe`. Some common configuration parameters and their defaults are as follows:
Most MoE related configuration arguments are prefixed with `moe`. The bare minimum addition to your configuration to enable MoE is as follows:

```yaml
moe_num_experts: 1 # 1 disables MoE. 8 is a common value.
```
moe_type: megablocks
moe_num_experts: 1 # 1 disables MoE. 8 is a reasonable value.
moe_loss_coeff: 0.1
expert_interval: 2 # See details below
enable_expert_tensor_parallelism: false # See details below
moe_expert_parallel_size: 1 # See details below
moe_token_dropping: false
```

DeepSpeed can be further configured with the following:

```
moe_top_k: 1
moe_min_capacity: 4
moe_train_capacity_factor: 1.0 # Setting to 1.0
moe_eval_capacity_factor: 1.0 # Setting to 1.0
```

One MoE layer is present every `expert_interval` transformer layers including the first, so with 12 layers total:

```
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
```

Experts would be in these layers:

```
0, 2, 4, 6, 8, 10
```
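
As a small illustrative check (not code from the repo), the placement rule can be written out directly:

```python
# Illustrative only: layers that get an MoE block under the
# "one MoE layer every `expert_interval` layers, including the first" rule.
num_layers = 12
expert_interval = 2
moe_layers = [layer for layer in range(num_layers) if layer % expert_interval == 0]
print(moe_layers)  # [0, 2, 4, 6, 8, 10]
```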

By default, we use expert-data parallelism, so any available tensor parallelism (`model_parallel_size`) will be used for expert routing. For instance, given the following:

```
expert_parallel_size: 4
model_parallel_size: 2 # aka tensor parallelism
```

With 32 GPUs, the behavior will look like:

- In non-expert layers:
- Tensor parallelism is 2. (There are 32 / 2 = 16 such tensor parallel groups, each of size 2.)
- Data parallelism implicitly becomes 32 / 2 = 16.
- In expert layers:
- There is no tensor parallelism.
- Expert parallelism is 4. (There are 32 / 4 = 8 expert parallel groups, each of size 4.)
- Data parallelism implicitly becomes 32 / 4 = 8. Some cross-node token routing happens as a result of this redivision of data parallelism between 16 and 8. To avoid it, ensure that `expert_parallel_size == model_parallel_size`.

Setting `enable_expert_tensor_parallelism` enables tensor-expert-data (TED) parallelism. The way to interpret the above would then be:

- In non-expert layers: same as before.
- In expert layers:
- Tensor parallelism is 2. (There are 32 / 2 = 16 tensor parallel groups, each of size 2.)
- Expert parallelism is 4. (There are 32 / 4 = 8 expert parallel groups, each of size 4.)
- Data parallelism implicitly becomes 32 / (2 * 4) = 4. Again, cross-node token routing happens. To avoid it, ensure `expert_parallel_size == 1` or `model_parallel_size == 1`.

So note that DP must be divisible by (MP * EP). For more details, see the [TED paper].
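
The group arithmetic above can be sanity-checked with a few lines (illustrative only; the variable names follow the example, not the library's internals):

```python
# Illustrative arithmetic for the 32-GPU example above.
world_size = 32
model_parallel_size = 2    # tensor parallelism (MP)
expert_parallel_size = 4   # expert parallelism (EP)

# Non-expert layers: data parallelism implied by tensor parallelism alone.
dp_dense = world_size // model_parallel_size                          # 16

# Expert layers with expert-data parallelism (no expert tensor parallelism).
dp_expert = world_size // expert_parallel_size                        # 8

# Expert layers with TED parallelism (enable_expert_tensor_parallelism).
assert world_size % (model_parallel_size * expert_parallel_size) == 0
dp_ted = world_size // (model_parallel_size * expert_parallel_size)   # 4

print(dp_dense, dp_expert, dp_ted)  # 16 8 4
```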

Pipeline parallelism is not yet supported - coming soon!

[TED paper]: https://arxiv.org/abs/2303.06318

# Datasets

46 changes: 19 additions & 27 deletions configs/125M-dmoe.yml
@@ -1,15 +1,9 @@
# GPT-2 pretraining setup
{
# See README for MoE config docs!
"moe_type": "megablocks",
"moe_token_dropping": false,
# Have 4 experts per layer (every 2 layers by default)
"moe_num_experts": 4,
# parallelism settings
"enable_expert_tensor_parallelism": true,
"pipe_parallel_size": 1, # not yet supported for MoE
"model_parallel_size": 1,
"moe_expert_parallel_size": 1,
# parallelism settings (you will want to change these based on your cluster setup, ideally scheduling pipeline stages
# across the node boundaries)
"pipe_parallel_size": 1, # MoE supports PP
"model_parallel_size": 1, # MoE uses model parallel group to split both experts and attention weights

# model settings
"num_layers": 12,
@@ -21,18 +15,21 @@
"pos_emb": "rotary",
"no_weight_tying": true,
"gpt_j_residual": false,
"output_layer_parallelism": "column",
"output_layer_parallelism": "column",

# moe settings
"moe_num_experts": 8,

# these should provide some speedup but take a while to build; set to true if desired
"scaled_upper_triang_masked_softmax_fusion": false,
"bias_gelu_fusion": false,
"rope_fusion": false,
"layernorm_fusion": false,

# init methods
"init_method": "small_init",
"output_layer_init_method": "wang_init",



# optimizer settings
"optimizer": {
"type": "Adam",
@@ -42,11 +39,11 @@
"eps": 1.0e-8,
}
},
"min_lr": 0.00006,

"min_lr": 0.00006,
# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
"zero_optimization": {
"stage": 0,
"stage": 1,
"allgather_partitions": True,
"allgather_bucket_size": 500000000,
"overlap_comm": True,
@@ -71,31 +68,26 @@
"hidden_dropout": 0.0,
"attention_dropout": 0.0,

# precision settings
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"precision": "bfloat16",

"fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32

# misc. training settings
"train_iters": 320000,
"lr_decay_iters": 320000,
"distributed_backend": "nccl",
"lr_decay_style": "cosine",
"warmup": 0.01,
"warmup": 0.1,
"checkpoint_factor": 10000,
"eval_interval": 1000,
"eval_iters": 10,

# logging
"log_interval": 10,
"log_interval": 100,
"steps_per_print": 10,
"keep_last_n_checkpoints": 4,
"wall_clock_breakdown": true,

# networking
# networking
"hostfile": "/mock_path"
}
101 changes: 0 additions & 101 deletions configs/125M-moe.yml

This file was deleted.
