From 3bec5de82ad50b843668210176cbffc481382f7f Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Tue, 2 Aug 2022 13:19:46 -0700
Subject: [PATCH 1/8] add ds inject policies

---
 mii/policies/__init__.py | 13 ++++++
 mii/policies/bert.py     | 69 ++++++++++++++++++++++++++++++++
 mii/policies/bloom.py    | 46 ++++++++++++++++++++++
 mii/policies/gpt2.py     | 44 +++++++++++++++++++++
 mii/policies/gpt_neo.py  | 51 ++++++++++++++++++++++++
 mii/policies/gpt_neox.py | 59 ++++++++++++++++++++++++++++
 mii/policies/gptj.py     | 51 ++++++++++++++++++++++++
 mii/policies/megatron.py | 85 ++++++++++++++++++++++++++++++++++++++++
 8 files changed, 418 insertions(+)
 create mode 100644 mii/policies/__init__.py
 create mode 100644 mii/policies/bert.py
 create mode 100644 mii/policies/bloom.py
 create mode 100644 mii/policies/gpt2.py
 create mode 100644 mii/policies/gpt_neo.py
 create mode 100644 mii/policies/gpt_neox.py
 create mode 100644 mii/policies/gptj.py
 create mode 100644 mii/policies/megatron.py

diff --git a/mii/policies/__init__.py b/mii/policies/__init__.py
new file mode 100644
index 00000000..06fa54a0
--- /dev/null
+++ b/mii/policies/__init__.py
@@ -0,0 +1,13 @@
+'''
+Copyright 2022 The Microsoft DeepSpeed Team
+'''
+
+supported_models = [
+    HFBertLayerPolicy,
+    HFGPTNEOLayerPolicy,
+    GPTNEOXLayerPolicy,
+    HFGPTJLayerPolicy,
+    MegatronLayerPolicy,
+    HFGPT2LayerPolicy,
+    BLOOMLayerPolicy
+]
diff --git a/mii/policies/bert.py b/mii/policies/bert.py
new file mode 100644
index 00000000..aaf20471
--- /dev/null
+++ b/mii/policies/bert.py
@@ -0,0 +1,69 @@
+'''
+Copyright 2022 The Microsoft DeepSpeed Team
+'''
+import torch
+from torch.nn.parameter import Parameter
+from deepspeed.module_inject.base_policy import InjectBasePolicy
+
+
+class HFBertLayerPolicy(InjectBasePolicy):
+    _orig_layer_class = None
+
+    def __init__(self, client_module, inference=False, preln=False):
+        super().__init__(inference)
+        self.client_module = client_module
+        self.preln = preln
+        if HFBertLayerPolicy._orig_layer_class is None:
+            try:
+                import transformers
+                HFBertLayerPolicy._orig_layer_class = [
+                    transformers.models.bert.modeling_bert.BertLayer,
+                    transformers.models.roberta.modeling_roberta.RobertaLayer
+                ]
+            except Exception:  # best effort; do not trap SystemExit/Ctrl-C
+                HFBertLayerPolicy._orig_layer_class = None
+
+    def get_hidden_heads(self):
+        return self.client_module.attention.self.query.weight.shape[1], \
+                self.client_module.attention.self.num_attention_heads
+
+    def attention(self):
+        qw = self.client_module.attention.self.query.weight
+        qb = self.client_module.attention.self.query.bias
+        kw = self.client_module.attention.self.key.weight
+        kb = self.client_module.attention.self.key.bias
+        vw = self.client_module.attention.self.value.weight
+        vb = self.client_module.attention.self.value.bias
+
+        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
+        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)
+
+        return self.linear_layer, \
+               qkvw, \
+               qkvb, \
+               self.client_module.attention.output.dense.weight, \
+               self.client_module.attention.output.dense.bias, \
+               self.scale_attention, \
+               self.is_megatron_v2
+
+    def mlp(self):
+        if self.preln:
+            intermediate_ff = self.client_module.intermediate.dense_act
+        else:
+            intermediate_ff = self.client_module.intermediate.dense
+
+        return self.linear_layer, intermediate_ff.weight, intermediate_ff.bias, \
+            self.client_module.output.dense.weight, \
+            self.client_module.output.dense.bias
+
+    def layerNorm(self):
+        if self.preln:
+            attention_layernorm = self.client_module.PostAttentionLayerNorm
+            transformer_layernorm = self.client_module.PreAttentionLayerNorm
+        else:
+            attention_layernorm = self.client_module.attention.output.LayerNorm
+            transformer_layernorm = self.client_module.output.LayerNorm
+        return attention_layernorm.weight, \
+               attention_layernorm.bias, \
+               transformer_layernorm.weight, \
+               transformer_layernorm.bias
diff --git a/mii/policies/bloom.py b/mii/policies/bloom.py
new file mode 100644
index 00000000..441d542c
--- /dev/null
+++ b/mii/policies/bloom.py
@@ -0,0 +1,46 @@
+'''
+Copyright 2022 The Microsoft DeepSpeed Team
+'''
+from deepspeed.module_inject.base_policy import InjectBasePolicy
+
+
+class BLOOMLayerPolicy(DSPolicy):
+    _orig_layer_class = None
+
+    def __init__(self, client_module, inference=True):
+        super().__init__(inference, linear_layer=True)
+        self.client_module = client_module
+        try:
+            import transformers
+            BLOOMLayerPolicy._orig_layer_class = transformers.models.bloom.modeling_bloom.BloomBlock
+        except Exception:
+            # Best effort: if BloomBlock cannot be resolved, leave the policy
+            # without a target class. No `supported_models` registry exists in
+            # this module, so nothing else is updated here.
+            BLOOMLayerPolicy._orig_layer_class = None
+
+    def get_hidden_heads(self):
+        return self.client_module.self_attention.hidden_size, \
+                self.client_module.self_attention.num_heads
+
+    def attention(self):
+        return self.linear_layer, \
+                self.client_module.self_attention.query_key_value.weight, \
+                self.client_module.self_attention.query_key_value.bias, \
+                self.client_module.self_attention.dense.weight, \
+                self.client_module.self_attention.dense.bias, \
+                self.scale_attention, \
+                self.is_megatron_v2
+
+    def mlp(self):
+        return self.linear_layer, \
+            self.client_module.mlp.dense_h_to_4h.weight, \
+            self.client_module.mlp.dense_h_to_4h.bias, \
+            self.client_module.mlp.dense_4h_to_h.weight, \
+            self.client_module.mlp.dense_4h_to_h.bias
+
+    def layerNorm(self):
+        return self.client_module.post_attention_layernorm.weight, \
+               self.client_module.post_attention_layernorm.bias, \
+               self.client_module.input_layernorm.weight, \
+               self.client_module.input_layernorm.bias
diff --git a/mii/policies/gpt2.py b/mii/policies/gpt2.py
new file mode 100644
index 00000000..b605d1ee
--- /dev/null
+++ b/mii/policies/gpt2.py
@@ -0,0 +1,44 @@
+'''
+Copyright 2022 The Microsoft DeepSpeed Team
+'''
+from deepspeed.module_inject.base_policy import InjectBasePolicy
+
+
+class HFGPT2LayerPolicy(InjectBasePolicy):
+    _orig_layer_class = None
+
+    def __init__(self, client_module, inference=True):
+        # HuggingFace GPT2 uses convolutional layer instead of linear layer
+        super().__init__(inference, linear_layer=False)
+        self.client_module = client_module
+        try:
+            import transformers
+            HFGPT2LayerPolicy._orig_layer_class = transformers.models.gpt2.modeling_gpt2.GPT2Block
+        except Exception:  # best effort; do not trap SystemExit/Ctrl-C
+            HFGPT2LayerPolicy._orig_layer_class = None
+
+    def get_hidden_heads(self):
+        return self.client_module.attn.embed_dim, \
+                self.client_module.attn.num_heads
+
+    def attention(self):
+        return self.linear_layer, \
+                self.client_module.attn.c_attn.weight, \
+                self.client_module.attn.c_attn.bias, \
+                self.client_module.attn.c_proj.weight, \
+                self.client_module.attn.c_proj.bias, \
+                self.scale_attention, \
+                self.is_megatron_v2
+
+    def mlp(self):
+        return self.linear_layer, \
+            self.client_module.mlp.c_fc.weight, \
+            self.client_module.mlp.c_fc.bias, \
+            self.client_module.mlp.c_proj.weight, \
+            self.client_module.mlp.c_proj.bias
+
+    def layerNorm(self):
+        return self.client_module.ln_2.weight, \
+               self.client_module.ln_2.bias, \
+               self.client_module.ln_1.weight, \
+               self.client_module.ln_1.bias
diff --git a/mii/policies/gpt_neo.py b/mii/policies/gpt_neo.py
new file mode 100644
index 00000000..fe11051c
--- /dev/null
+++ b/mii/policies/gpt_neo.py
@@ -0,0 +1,51 @@
+'''
+Copyright 2022 The Microsoft DeepSpeed Team
+'''
+import torch
+from torch.nn.parameter import Parameter
+from deepspeed.module_inject.base_policy import InjectBasePolicy
+
+
+class HFGPTNEOLayerPolicy(InjectBasePolicy):
+    _orig_layer_class = None
+
+    def __init__(self, client_module, inference=True):
+        super().__init__(inference, scale_attention=False)
+        self.client_module = client_module
+        try:
+            import transformers
+            HFGPTNEOLayerPolicy._orig_layer_class = transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoBlock
+        except Exception:  # best effort; do not trap SystemExit/Ctrl-C
+            HFGPTNEOLayerPolicy._orig_layer_class = None
+
+    def get_hidden_heads(self):
+        return self.client_module.attn.attention.q_proj.weight.shape[1], \
+                self.client_module.attn.attention.num_heads
+
+    def attention(self):
+        qw = self.client_module.attn.attention.q_proj.weight
+        kw = self.client_module.attn.attention.k_proj.weight
+        vw = self.client_module.attn.attention.v_proj.weight
+
+        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
+
+        return self.linear_layer, \
+                qkvw, \
+                None, \
+                self.client_module.attn.attention.out_proj.weight, \
+                self.client_module.attn.attention.out_proj.bias, \
+                self.scale_attention, \
+                self.is_megatron_v2
+
+    def mlp(self):
+        return self.linear_layer, \
+                self.client_module.mlp.c_fc.weight, \
+                self.client_module.mlp.c_fc.bias, \
+                self.client_module.mlp.c_proj.weight, \
+                self.client_module.mlp.c_proj.bias
+
+    def layerNorm(self):
+        return self.client_module.ln_2.weight, \
+               self.client_module.ln_2.bias, \
+               self.client_module.ln_1.weight, \
+               self.client_module.ln_1.bias
diff --git a/mii/policies/gpt_neox.py b/mii/policies/gpt_neox.py
new file mode 100644
index 00000000..ae8d4b31
--- /dev/null
+++ b/mii/policies/gpt_neox.py
@@ -0,0 +1,59 @@
+'''
+Copyright 2022 The Microsoft DeepSpeed Team
+'''
+import torch
+from deepspeed.module_inject.base_policy import InjectBasePolicy
+
+
+class GPTNEOXLayerPolicy(InjectBasePolicy):
+    _orig_layer_class = None
+    version = 0
+
+    def __init__(self, client_module, inference=True, megatron_v2=True):
+        super().__init__(inference, megatron_v2=megatron_v2)
+        self.client_module = client_module
+        if GPTNEOXLayerPolicy._orig_layer_class is None:
+            if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
+                GPTNEOXLayerPolicy._orig_layer_class = None
+            else:
+                try:
+                    from transformers import GPTNeoXLayer
+                    GPTNEOXLayerPolicy._orig_layer_class = GPTNeoXLayer
+                except ImportError:
+                    GPTNEOXLayerPolicy._orig_layer_class = None
+
+    def get_hidden_heads(self):
+        if GPTNEOXLayerPolicy.version == 0:
+            attention = self.client_module.attention
+        else:
+            attention = self.client_module.self_attention
+
+        return attention.query_key_value.weight.shape[1], \
+                attention.num_attention_heads
+
+    def attention(self):
+        if GPTNEOXLayerPolicy.version == 0:
+            attention = self.client_module.attention
+        else:
+            attention = self.client_module.self_attention
+
+        return self.linear_layer, \
+                attention.query_key_value.weight, \
+                attention.query_key_value.bias, \
+                attention.dense.weight, \
+                attention.dense.bias, \
+                self.scale_attention, \
+                self.is_megatron_v2
+
+    def mlp(self):
+        return self.linear_layer, \
+            self.client_module.mlp.dense_h_to_4h.weight, \
+            self.client_module.mlp.dense_h_to_4h.bias, \
+            self.client_module.mlp.dense_4h_to_h.weight, \
+            self.client_module.mlp.dense_4h_to_h.bias
+
+    def layerNorm(self):
+        return self.client_module.post_attention_layernorm.weight, \
+               self.client_module.post_attention_layernorm.bias, \
+               self.client_module.input_layernorm.weight, \
+               self.client_module.input_layernorm.bias
diff --git a/mii/policies/gptj.py b/mii/policies/gptj.py
new file mode 100644
index 00000000..2baf1d39
--- /dev/null
+++ b/mii/policies/gptj.py
@@ -0,0 +1,51 @@
+'''
+Copyright 2022 The Microsoft DeepSpeed Team
+'''
+import torch
+from torch.nn.parameter import Parameter
+from deepspeed.module_inject.base_policy import InjectBasePolicy
+
+
+class HFGPTJLayerPolicy(InjectBasePolicy):
+    _orig_layer_class = None
+
+    def __init__(self, client_module, inference=True):
+        super().__init__(inference, scale_attention=True)
+        self.client_module = client_module
+        try:
+            import transformers
+            HFGPTJLayerPolicy._orig_layer_class = transformers.models.gptj.modeling_gptj.GPTJBlock
+        except Exception:  # best effort; do not trap SystemExit/Ctrl-C
+            HFGPTJLayerPolicy._orig_layer_class = None
+
+    def get_hidden_heads(self):
+        return self.client_module.attn.q_proj.weight.shape[1], \
+                self.client_module.attn.num_attention_heads
+
+    def attention(self):
+        qw = self.client_module.attn.q_proj.weight
+        kw = self.client_module.attn.k_proj.weight
+        vw = self.client_module.attn.v_proj.weight
+
+        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
+
+        return self.linear_layer, \
+                qkvw, \
+                None, \
+                self.client_module.attn.out_proj.weight, \
+                None, \
+                self.scale_attention, \
+                self.is_megatron_v2
+
+    def mlp(self):
+        return self.linear_layer, \
+                self.client_module.mlp.fc_in.weight, \
+                self.client_module.mlp.fc_in.bias, \
+                self.client_module.mlp.fc_out.weight, \
+                self.client_module.mlp.fc_out.bias
+
+    def layerNorm(self):
+        return None, \
+               None, \
+               self.client_module.ln_1.weight, \
+               self.client_module.ln_1.bias
diff --git a/mii/policies/megatron.py b/mii/policies/megatron.py
new file mode 100644
index 00000000..15eb01cd
--- /dev/null
+++ b/mii/policies/megatron.py
@@ -0,0 +1,85 @@
+'''
+Copyright 2022 The Microsoft DeepSpeed Team
+'''
+from packaging import version as pkg_version
+from deepspeed.module_inject.base_policy import InjectBasePolicy
+
+
+class MegatronLayerPolicy(DSPolicy):
+    _orig_layer_class = None
+    version = 0
+    moe_type = 'standard'
+
+    def __init__(self, client_module, inference=True):
+        super().__init__(inference)
+        self.client_module = client_module
+        # we use megatron version to differentiate between the old and new
+        # megatron-lm source code
+        if MegatronLayerPolicy._orig_layer_class is None:
+            if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
+                MegatronLayerPolicy._orig_layer_class = None
+            else:
+                try:
+                    from megatron.model.transformer import ParallelTransformerLayer
+                    MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer
+                except ImportError:
+                    MegatronLayerPolicy._orig_layer_class = None
+
+    def get_hidden_heads(self):
+        return self.client_module.attention.query_key_value.weight.shape[1], \
+                self.client_module.attention.num_attention_heads
+
+    def attention(self):
+        # Always bind `attention`; it used to be unbound when not in inference.
+        if MegatronLayerPolicy.version == 0:
+            attention = self.client_module.attention
+        else:
+            attention = self.client_module.self_attention
+
+        return self.linear_layer, \
+                attention.query_key_value.weight, \
+                attention.query_key_value.bias, \
+                attention.dense.weight, \
+                attention.dense.bias, \
+                self.scale_attention, \
+                self.is_megatron_v2
+
+    def mlp(self, moe_type='standard'):
+        from deepspeed.moe.utils import has_moe_layers
+        moe, _ = has_moe_layers(self.client_module)
+
+        if moe:
+            moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
+                            self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
+            num_experts = len(moe_experts)
+            if moe_type == 'standard':
+                return self.linear_layer, \
+                    [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
+                    [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
+                    [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
+                    [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
+            else:
+
+                return self.linear_layer, \
+                    [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
+                    [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
+                    [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
+                    [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
+                    self.client_module.mlp.mlp.dense_h_to_4h.weight, \
+                    self.client_module.mlp.mlp.dense_h_to_4h.bias, \
+                    self.client_module.mlp.mlp.dense_4h_to_h.weight, \
+                    self.client_module.mlp.mlp.dense_4h_to_h.bias, \
+                    self.client_module.mlp.coefficient.weight
+
+        else:
+            return self.linear_layer, \
+                self.client_module.mlp.dense_h_to_4h.weight, \
+                self.client_module.mlp.dense_h_to_4h.bias, \
+                self.client_module.mlp.dense_4h_to_h.weight, \
+                self.client_module.mlp.dense_4h_to_h.bias
+
+    def layerNorm(self):
+        return self.client_module.post_attention_layernorm.weight, \
+               self.client_module.post_attention_layernorm.bias, \
+               self.client_module.input_layernorm.weight, \
+               self.client_module.input_layernorm.bias

From 2d28e52ffaab6388e2869ebf217775d6ac868499 Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Fri, 5 Aug 2022 16:24:58 -0700
Subject: [PATCH 2/8] fix imports

---
 mii/policies/__init__.py      | 15 ++++++++-------
 mii/policies/bloom.py         |  2 +-
 mii/policies/gpt_neox.py      |  1 +
 mii/policies/megatron.py      |  3 ++-
 requirements/requirements.txt |  1 +
 5 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/mii/policies/__init__.py b/mii/policies/__init__.py
index 06fa54a0..82b17026 100644
--- a/mii/policies/__init__.py
+++ b/mii/policies/__init__.py
@@ -1,13 +1,14 @@
 '''
 Copyright 2022 The Microsoft DeepSpeed Team
 '''
+import mii.policies
 
 supported_models = [
-    HFBertLayerPolicy,
-    HFGPTNEOLayerPolicy,
-    GPTNEOXLayerPolicy,
-    HFGPTJLayerPolicy,
-    MegatronLayerPolicy,
-    HFGPT2LayerPolicy,
-    BLOOMLayerPolicy
+    mii.policies.HFBertLayerPolicy,
+    mii.policies.HFGPTNEOLayerPolicy,
+    mii.policies.GPTNEOXLayerPolicy,
+    mii.policies.HFGPTJLayerPolicy,
+    mii.policies.MegatronLayerPolicy,
+    mii.policies.HFGPT2LayerPolicy,
+    mii.policies.BLOOMLayerPolicy
 ]
diff --git a/mii/policies/bloom.py b/mii/policies/bloom.py
index 441d542c..5aed42ee 100644
--- a/mii/policies/bloom.py
+++ b/mii/policies/bloom.py
@@ -4,7 +4,7 @@
 from deepspeed.module_inject.base_policy import InjectBasePolicy
 
 
-class BLOOMLayerPolicy(DSPolicy):
+class BLOOMLayerPolicy(InjectBasePolicy):
     _orig_layer_class = None
 
     def __init__(self, client_module, inference=True):
diff --git a/mii/policies/gpt_neox.py b/mii/policies/gpt_neox.py
index ae8d4b31..7e519581 100644
--- a/mii/policies/gpt_neox.py
+++ b/mii/policies/gpt_neox.py
@@ -2,6 +2,7 @@
 Copyright 2022 The Microsoft DeepSpeed Team
 '''
 import torch
+from packaging import version as pkg_version
 from deepspeed.module_inject.base_policy import InjectBasePolicy
 
 
diff --git a/mii/policies/megatron.py b/mii/policies/megatron.py
index 15eb01cd..33683ecd 100644
--- a/mii/policies/megatron.py
+++ b/mii/policies/megatron.py
@@ -1,11 +1,12 @@
 '''
 Copyright 2022 The Microsoft DeepSpeed Team
 '''
+import torch
 from packaging import version as pkg_version
 from deepspeed.module_inject.base_policy import InjectBasePolicy
 
 
-class MegatronLayerPolicy(DSPolicy):
+class MegatronLayerPolicy(InjectBasePolicy):
     _orig_layer_class = None
     version = 0
     moe_type = 'standard'
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 5f1370ae..570715da 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -2,6 +2,7 @@ asyncio
 deepspeed>=0.6.7
 grpcio
 grpcio-tools
+packaging
 pydantic
 torch
 transformers

From 42504d6fee479d10e501b2ebf16b2f800c4a962d Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Fri, 5 Aug 2022 16:29:19 -0700
Subject: [PATCH 3/8] install latest ds

---
 .github/workflows/formatting.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
index 6aa26187..23b3adf9 100644
--- a/.github/workflows/formatting.yml
+++ b/.github/workflows/formatting.yml
@@ -29,6 +29,7 @@ jobs:
 
       - name: Install MII
         run: |
+          pip install git+https://github.com/microsoft/deepspeed.git
           pip install .[dev]
 
       - name: Formatting checks

From 48ea217b7231593c59596372a840e9016f7416db Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Fri, 5 Aug 2022 17:35:16 -0700
Subject: [PATCH 4/8] fix policy imports

---
 mii/policies/__init__.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/mii/policies/__init__.py b/mii/policies/__init__.py
index 82b17026..89164ca9 100644
--- a/mii/policies/__init__.py
+++ b/mii/policies/__init__.py
@@ -1,14 +1,20 @@
 '''
 Copyright 2022 The Microsoft DeepSpeed Team
 '''
-import mii.policies
+from .bert import HFBertLayerPolicy
+from .gpt_neo import HFGPTNEOLayerPolicy
+from .gpt_neox import GPTNEOXLayerPolicy
+from .gptj import HFGPTJLayerPolicy
+from .megatron import MegatronLayerPolicy
+from .gpt2 import HFGPT2LayerPolicy
+from .bloom import BLOOMLayerPolicy
 
-supported_models = [
-    mii.policies.HFBertLayerPolicy,
-    mii.policies.HFGPTNEOLayerPolicy,
-    mii.policies.GPTNEOXLayerPolicy,
-    mii.policies.HFGPTJLayerPolicy,
-    mii.policies.MegatronLayerPolicy,
-    mii.policies.HFGPT2LayerPolicy,
-    mii.policies.BLOOMLayerPolicy
+replace_policies = [
+    HFBertLayerPolicy,
+    HFGPTNEOLayerPolicy,
+    GPTNEOXLayerPolicy,
+    HFGPTJLayerPolicy,
+    MegatronLayerPolicy,
+    HFGPT2LayerPolicy,
+    BLOOMLayerPolicy
 ]

From 1c92d07da09716ac237b93372d2742b3ccde1b9f Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Fri, 5 Aug 2022 17:39:10 -0700
Subject: [PATCH 5/8] point to ds branch

---
 .github/workflows/nv-torch-latest-v100.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nv-torch-latest-v100.yaml b/.github/workflows/nv-torch-latest-v100.yaml
index 617ed8fa..d2729880 100644
--- a/.github/workflows/nv-torch-latest-v100.yaml
+++ b/.github/workflows/nv-torch-latest-v100.yaml
@@ -34,7 +34,7 @@ jobs:
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
       - name: Install MII
         run: |
-          pip install git+https://github.com/microsoft/DeepSpeed.git
+          pip install git+https://github.com/microsoft/DeepSpeed.git@staging-mii-update
           pip install git+https://github.com/huggingface/transformers.git
           pip install -U accelerate
           pip install .[dev,local]

From 4d420db39f8663de5e12e8239425492f6b4b8a9f Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Thu, 15 Sep 2022 10:52:05 -0700
Subject: [PATCH 6/8] align ds version

---
 .github/workflows/cpu.yml                   | 3 +--
 .github/workflows/formatting.yml            | 3 +--
 .github/workflows/nv-torch-latest-v100.yaml | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/cpu.yml b/.github/workflows/cpu.yml
index 743e243d..ab13bc92 100644
--- a/.github/workflows/cpu.yml
+++ b/.github/workflows/cpu.yml
@@ -29,8 +29,7 @@ jobs:
 
       - name: Install MII
         run: |
-          pip install git+https://github.com/microsoft/DeepSpeed.git
-          pip install .[dev,local]
+          pip install .[dev,local] git+https://github.com/microsoft/deepspeed.git@staging-mii-update
 
       - name: Unit tests
         run: |
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
index 23b3adf9..ca38a4ff 100644
--- a/.github/workflows/formatting.yml
+++ b/.github/workflows/formatting.yml
@@ -29,8 +29,7 @@ jobs:
 
       - name: Install MII
         run: |
-          pip install git+https://github.com/microsoft/deepspeed.git
-          pip install .[dev]
+          pip install .[dev] git+https://github.com/microsoft/deepspeed.git@staging-mii-update
 
       - name: Formatting checks
         run: |
diff --git a/.github/workflows/nv-torch-latest-v100.yaml b/.github/workflows/nv-torch-latest-v100.yaml
index d2729880..fc61472d 100644
--- a/.github/workflows/nv-torch-latest-v100.yaml
+++ b/.github/workflows/nv-torch-latest-v100.yaml
@@ -34,10 +34,9 @@ jobs:
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
       - name: Install MII
         run: |
-          pip install git+https://github.com/microsoft/DeepSpeed.git@staging-mii-update
           pip install git+https://github.com/huggingface/transformers.git
           pip install -U accelerate
-          pip install .[dev,local]
+          pip install .[dev,local] git+https://github.com/microsoft/deepspeed.git@staging-mii-update
           ds_report
       - name: Unit tests
         run: |

From 9550973aa7e15a32ce1a5d924c8adff60041d274 Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Thu, 15 Sep 2022 11:02:53 -0700
Subject: [PATCH 7/8] remove ds version min

---
 requirements/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 570715da..95f1ef4d 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,5 +1,5 @@
 asyncio
-deepspeed>=0.6.7
+deepspeed
 grpcio
 grpcio-tools
 packaging

From 0b1e37a558b509ec11c8700d136c3bac8ae9530a Mon Sep 17 00:00:00 2001
From: Jeff Rasley <jerasley@microsoft.com>
Date: Thu, 15 Sep 2022 14:56:21 -0700
Subject: [PATCH 8/8] add release scripts

---
 release/bump_patch_version.py |  9 +++++++
 release/release.sh            | 49 +++++++++++++++++++++++++++++++++++
 version.txt                   |  2 +-
 3 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100644 release/bump_patch_version.py
 create mode 100644 release/release.sh

diff --git a/release/bump_patch_version.py b/release/bump_patch_version.py
new file mode 100644
index 00000000..8f1150de
--- /dev/null
+++ b/release/bump_patch_version.py
@@ -0,0 +1,9 @@
+from packaging import version as pkg_version
+
+with open('../version.txt') as fd:
+    version = pkg_version.parse(fd.read())
+# Next release: same major/minor with the patch component incremented.
+bumped = f'{version.major}.{version.minor}.{version.micro + 1}'
+with open('../version.txt', 'w') as fd:
+    fd.write(bumped + '\n')
+print(f'{version} -> {bumped}')
diff --git a/release/release.sh b/release/release.sh
new file mode 100644
index 00000000..ab56ba3c
--- /dev/null
+++ b/release/release.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+cd ..
+
+if [ ! -f ~/.pypirc ]; then
+    echo 'create .pypirc in order to upload to PyPI'
+    exit 1
+fi
+
+version=$1
+
+if [ -z "$version" ]; then
+    echo "please provide version number for release"
+    exit 1
+fi
+
+if [[ $version == *"v"* ]]; then
+    echo "please only include version number without 'v' prefix"
+    exit 1
+fi
+
+if [ "${version}" != "$(cat version.txt)" ]; then
+    echo "version=${version} does not match version.txt"
+    cat version.txt
+    exit 1
+fi
+
+python -c "import twine"
+if [ $? != 0 ]; then
+    echo 'please install twine via pip'
+    exit 1
+fi
+
+MII_BUILD_STRING="" python setup.py sdist
+
+if [ ! -f dist/mii-${version}.tar.gz ]; then
+    echo "prepared version does not match version given ($version), bump version first?"
+    ls dist
+    exit 1
+fi
+
+python -m twine upload dist/mii-${version}.tar.gz --repository mii
+
+git tag v${version}
+git push origin v${version}
+
+echo "bumping up patch version"
+cd -
+python bump_patch_version.py
diff --git a/version.txt b/version.txt
index 6e8bf73a..8acdd82b 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.1.0
+0.0.1