
Commit ac609ea

Merge pull request #13 from Stability-AI/nar
Add NAR support
2 parents ffb105a + 36366ef

10 files changed (+897, -8 lines)

Diff for: .gitignore (+4, -5)

@@ -1,8 +1,3 @@
-# Repo-specific
-# wav file created as part of the gradio demo
-output.wav
-
-
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -163,3 +158,7 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+*.ckpt
+*.wav
+wandb/*

Diff for: stable_audio_tools/models/factory.py (+3)

@@ -23,6 +23,9 @@ def create_model_from_config(model_config):
     elif model_type == 'lm':
         from .lm import create_audio_lm_from_config
         return create_audio_lm_from_config(model_config)
+    elif model_type == 'nar':
+        from .nar import create_audio_nar_from_config
+        return create_audio_nar_from_config(model_config)
     else:
         raise NotImplementedError(f'Unknown model type: {model_type}')
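For orientation, a minimal sketch of how the new branch is reached, assuming a config JSON whose "model_type" field is "nar". The file name below is a placeholder, and the rest of the NAR config schema is defined by create_audio_nar_from_config, which is not part of this diff:

    import json

    from stable_audio_tools.models.factory import create_model_from_config

    # Placeholder path; any config with "model_type": "nar" now takes the new branch.
    with open("nar_model_config.json") as f:
        model_config = json.load(f)

    model = create_model_from_config(model_config)  # dispatches to .nar.create_audio_nar_from_config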

Diff for: stable_audio_tools/models/lm_backbone.py (+90, -1)

@@ -1,9 +1,12 @@
 import torch
+import math
 from torch import nn
 from x_transformers import ContinuousTransformerWrapper, Decoder
+from functools import partial
 
 from mamba_ssm.utils.generation import InferenceParams
 from .transformer import ContinuousTransformer
+from .mambaplus.mamba import MambaPlus, MambaPlusConfig
 
 # Interface for backbone of a language model
 # Handles conditioning and cross-attention
@@ -253,4 +256,90 @@ def forward(self, x, mask=None, prepend_cond=None, prepend_cond_mask=None, cross
             self.cuda_graph.replay()
             return self.captured_logits.clone()
 
-        return self.model(x, inference_params=self.inference_params if use_cache else None)[:, prepend_length:, :]
+        return self.model(x, inference_params=self.inference_params if use_cache else None)[:, prepend_length:, :]
+
+
+def _init_weights(
+    module,
+    n_layer,
+    initializer_range=0.02,  # Now only used for the embedding layer.
+    rescale_prenorm_residual=True,
+    n_residuals_per_layer=1,  # Change to 2 if we have an MLP
+):
+    if isinstance(module, nn.Linear):
+        if module.bias is not None:
+            if not getattr(module.bias, "_no_reinit", False):
+                nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Embedding):
+        nn.init.normal_(module.weight, std=initializer_range)
+
+    if rescale_prenorm_residual:
+        # Reinitialize selected weights subject to the OpenAI GPT-2 paper scheme:
+        # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+        # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+        # > -- GPT-2 :: https://openai.com/blog/better-language-models/
+        #
+        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+        for name, p in module.named_parameters():
+            if name in ["out_proj.weight", "fc2.weight"]:
+                # Special scaled initialization --> there are 2 layer norms per Transformer block.
+                # Following PyTorch init, except scale by 1/sqrt(2 * n_layer).
+                # We need to reinit p since this code could be called multiple times;
+                # having just p *= scale would repeatedly scale it down.
+                nn.init.kaiming_uniform_(p, a=math.sqrt(5))
+                with torch.no_grad():
+                    p /= math.sqrt(n_residuals_per_layer * n_layer)
+
+class MambaPlusAudioLMBackbone(AudioLMBackbone):
+    def __init__(self,
+                 embed_dim: int = 512,
+                 n_layers: int = 32,
+                 d_state: int = 1,
+                 bidirectional: bool = False,
+                 num_mod_groups: int = 128,
+                 cross_attn_cond_dim: int = 0,
+                 prepend_cond_dim: int = 0,
+                 **kwargs):
+        super().__init__(embed_dim=embed_dim)
+
+        self.config = MambaPlusConfig(d_model=embed_dim,
+                                      n_layers=n_layers,
+                                      d_state=d_state,
+                                      expand_factor=2,
+                                      num_mod_groups=num_mod_groups,
+                                      complex=True,
+                                      mamba_plus_enabled=True,
+                                      bidirectional=bidirectional,
+                                      **kwargs)
+
+        # Embeddings are done in the AudioLanguageModel, so the backbone takes continuous inputs
+        self.model = MambaPlus(
+            config=self.config,
+            **kwargs
+        )
+        self.apply(
+            partial(
+                _init_weights,
+                n_layer=self.config.n_layers,
+            )
+        )
+
+        if prepend_cond_dim > 0:
+            # Prepend conditioning
+            self.to_prepend_embed = nn.Sequential(
+                nn.Linear(prepend_cond_dim, embed_dim, bias=False)
+            )
+
+        assert cross_attn_cond_dim == 0, "Cross-attention conditioning not supported for MambaPlus"
+
+    def forward(self, x, mask=None, prepend_cond=None, prepend_cond_mask=None, cross_attn_cond=None, use_cache=False):
+
+        prepend_length = 0
+        if prepend_cond is not None:
+            # Project the prepend conditioning to the embedding dimension
+            prepend_cond = self.to_prepend_embed(prepend_cond)
+            prepend_length = prepend_cond.shape[1]
+
+            x = torch.cat([prepend_cond, x], dim=1)
+
+        return self.model(x)[:, prepend_length:, :]
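As a sanity check on the new _init_weights hook, here is a minimal sketch that applies it to a toy stack of blocks; ToyBlock and its layer names are illustrative, not part of the commit, and the import assumes the repository and its Mamba dependencies are installed. Any parameter named out_proj.weight (or fc2.weight) is re-drawn with Kaiming-uniform init and divided by sqrt(n_residuals_per_layer * n_layer), the GPT-2/Megatron-LM residual rescaling cited in the comments, so its standard deviation ends up roughly 1/sqrt(n_layer) of a freshly initialized nn.Linear of the same shape:

    from functools import partial

    from torch import nn

    from stable_audio_tools.models.lm_backbone import _init_weights

    class ToyBlock(nn.Module):
        # Illustrative block whose output projection matches the "out_proj.weight"
        # parameter name that _init_weights rescales.
        def __init__(self, dim):
            super().__init__()
            self.in_proj = nn.Linear(dim, dim)
            self.out_proj = nn.Linear(dim, dim)

    n_layer = 32
    blocks = nn.ModuleList([ToyBlock(512) for _ in range(n_layer)])

    # Same pattern as MambaPlusAudioLMBackbone.__init__:
    blocks.apply(partial(_init_weights, n_layer=n_layer))

    # out_proj weights are scaled down by ~1/sqrt(32); in_proj keeps the default init.
    print(blocks[0].out_proj.weight.std(), blocks[0].in_proj.weight.std())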
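And a hedged usage sketch for the new backbone itself, assuming the .mambaplus package and its kernels are available on a CUDA device (MambaPlus and MambaPlusConfig are not shown in this diff) and that MambaPlus accepts batched (batch, sequence, embed_dim) inputs as the forward pass here implies; the prepend_cond_dim value is an illustrative choice, not something this commit fixes:

    import torch

    from stable_audio_tools.models.lm_backbone import MambaPlusAudioLMBackbone

    backbone = MambaPlusAudioLMBackbone(
        embed_dim=512,
        n_layers=32,
        d_state=1,
        bidirectional=False,
        prepend_cond_dim=768,  # illustrative conditioning width, e.g. a text-encoder embedding
    ).cuda()

    x = torch.randn(2, 100, 512, device="cuda")      # (batch, seq, embed_dim) continuous inputs
    prepend = torch.randn(2, 1, 768, device="cuda")  # conditioning to project and prepend

    # Conditioning is projected to embed_dim, prepended, run through MambaPlus,
    # then the prepended positions are stripped from the output.
    out = backbone(x, prepend_cond=prepend)
    print(out.shape)  # torch.Size([2, 100, 512])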
