13 | 13 | # limitations under the License.
14 | 14 | """DistilBERT preprocessing layers."""
15 | 15 |
   | 16 | +import copy
   | 17 | +import os
   | 18 | +
16 | 19 | from tensorflow import keras
17 | 20 |
18 | 21 | from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker
   | 22 | +from keras_nlp.models.distilbert.distilbert_presets import backbone_presets
19 | 23 | from keras_nlp.tokenizers.word_piece_tokenizer import WordPieceTokenizer
20 | 24 | from keras_nlp.utils.python_utils import classproperty
   | 25 | +from keras_nlp.utils.python_utils import format_docstring
21 | 26 |
22 | 27 |
23 | 28 | @keras.utils.register_keras_serializable(package="keras_nlp")
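The `presets` accessor added below is exposed through `classproperty`, imported above, so it can be read off the class itself (e.g. `DistilBertTokenizer.presets`) without an instance. A minimal sketch of the usual recipe; keras_nlp ships its own helper in `keras_nlp/utils/python_utils.py`, which may differ in detail:

```python
# Sketch of a minimal `classproperty`; an assumption about the helper
# imported from keras_nlp.utils.python_utils, not its verbatim source.
class classproperty(property):
    def __get__(self, instance, owner_cls):
        # Invoke the wrapped function with the class, not an instance.
        return self.fget(owner_cls)


class Demo:
    @classproperty
    def presets(cls):
        return {"demo_preset": {}}


print(Demo.presets)  # {'demo_preset': {}} -- no instance required
```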
@@ -104,15 +109,56 @@ def __init__(
104 | 109 |
105 | 110 |     @classproperty
106 | 111 |     def presets(cls):
107 |     | -        raise NotImplementedError
    | 112 | +        return copy.deepcopy(backbone_presets)
108 | 113 |
109 | 114 |     @classmethod
    | 115 | +    @format_docstring(names=", ".join(backbone_presets))
110 | 116 |     def from_preset(
111 | 117 |         cls,
112 | 118 |         preset,
113 | 119 |         **kwargs,
114 | 120 |     ):
115 |     | -        raise NotImplementedError
    | 121 | +        """Instantiate a DistilBERT tokenizer from a preset vocabulary.
    | 122 | +
    | 123 | +        Args:
    | 124 | +            preset: string. Must be one of {{names}}.
    | 125 | +
    | 126 | +        Examples:
    | 127 | +        ```python
    | 128 | +        # Load a preset tokenizer.
    | 129 | +        tokenizer = keras_nlp.models.DistilBertTokenizer.from_preset(
    | 130 | +            "distilbert_base_uncased_en",
    | 131 | +        )
    | 132 | +
    | 133 | +        # Tokenize some input.
    | 134 | +        tokenizer("The quick brown fox tripped.")
    | 135 | +
    | 136 | +        # Detokenize some input.
    | 137 | +        tokenizer.detokenize([5, 6, 7, 8, 9])
    | 138 | +        ```
    | 139 | +        """
    | 140 | +        if preset not in cls.presets:
    | 141 | +            raise ValueError(
    | 142 | +                "`preset` must be one of "
    | 143 | +                f"""{", ".join(cls.presets)}. Received: {preset}."""
    | 144 | +            )
    | 145 | +        metadata = cls.presets[preset]
    | 146 | +
    | 147 | +        vocabulary = keras.utils.get_file(
    | 148 | +            "vocab.txt",
    | 149 | +            metadata["vocabulary_url"],
    | 150 | +            cache_subdir=os.path.join("models", preset),
    | 151 | +            file_hash=metadata["vocabulary_hash"],
    | 152 | +        )
    | 153 | +
    | 154 | +        config = metadata["preprocessor_config"]
    | 155 | +        config.update(
    | 156 | +            {
    | 157 | +                "vocabulary": vocabulary,
    | 158 | +            },
    | 159 | +        )
    | 160 | +
    | 161 | +        return cls.from_config({**config, **kwargs})
116 | 162 |
117 | 163 |
118 | 164 | @keras.utils.register_keras_serializable(package="keras_nlp")
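`format_docstring` fills the `{{names}}` placeholders in the docstrings above and below with the preset names at import time. A hypothetical sketch of how such a decorator can behave, assuming `{{key}}` markers; the real implementation lives in `keras_nlp/utils/python_utils.py` and may differ:

```python
# Hypothetical sketch of a docstring-templating decorator, not the
# library's verbatim source. Each `{{key}}` marker is replaced with the
# supplied value, leaving other braces (e.g. in code examples) untouched.
def format_docstring(**replacements):
    def decorate(obj):
        doc = obj.__doc__ or ""
        for key, value in replacements.items():
            doc = doc.replace("{{" + key + "}}", str(value))
        obj.__doc__ = doc
        return obj
    return decorate


@format_docstring(names="distilbert_base_uncased_en")
def from_preset(preset):
    """Must be one of {{names}}."""


print(from_preset.__doc__)  # Must be one of distilbert_base_uncased_en.
```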
@@ -238,14 +284,78 @@ def call(self, inputs):
238 | 284 |
239 | 285 |     @classproperty
240 | 286 |     def presets(cls):
241 |     | -        raise NotImplementedError
    | 287 | +        return copy.deepcopy(backbone_presets)
242 | 288 |
243 | 289 |     @classmethod
    | 290 | +    @format_docstring(names=", ".join(backbone_presets))
244 | 291 |     def from_preset(
245 | 292 |         cls,
246 | 293 |         preset,
247 | 294 |         sequence_length=None,
248 | 295 |         truncate="round_robin",
249 | 296 |         **kwargs,
250 | 297 |     ):
251 |     | -        raise NotImplementedError
    | 298 | +        """Instantiate a DistilBERT preprocessor from a preset architecture.
    | 299 | +
    | 300 | +        Args:
    | 301 | +            preset: string. Must be one of {{names}}.
    | 302 | +            sequence_length: int, optional. The length of the packed inputs.
    | 303 | +                Must be equal to or smaller than the `max_sequence_length`
    | 304 | +                of the preset. If left as default, the `max_sequence_length`
    | 305 | +                of the preset will be used.
    | 306 | +            truncate: string. The algorithm to truncate a list of batched
    | 307 | +                segments to fit within `sequence_length`. The value can be
    | 308 | +                either `"round_robin"` or `"waterfall"`:
    | 309 | +                - `"round_robin"`: Available space is assigned one token at
    | 310 | +                    a time in a round-robin fashion to the inputs that still
    | 311 | +                    need some, until the limit is reached.
    | 312 | +                - `"waterfall"`: The allocation of the budget is done using
    | 313 | +                    a "waterfall" algorithm that allocates quota in a
    | 314 | +                    left-to-right manner and fills up the buckets until we
    | 315 | +                    run out of budget. It supports an arbitrary number of
    | 316 | +                    segments.
    | 317 | +
    | 318 | +        Examples:
    | 319 | +        ```python
    | 320 | +        # Load a preprocessor from a preset.
    | 321 | +        preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    | 322 | +            "distilbert_base_uncased_en",
    | 323 | +        )
    | 324 | +        preprocessor("The quick brown fox jumped.")
    | 325 | +
    | 326 | +        # Override `sequence_length`.
    | 327 | +        preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    | 328 | +            "distilbert_base_uncased_en",
    | 329 | +            sequence_length=64,
    | 330 | +        )
    | 331 | +        preprocessor("The quick brown fox jumped.")
    | 332 | +        ```
    | 333 | +        """
    | 334 | +        if preset not in cls.presets:
    | 335 | +            raise ValueError(
    | 336 | +                "`preset` must be one of "
    | 337 | +                f"""{", ".join(cls.presets)}. Received: {preset}."""
    | 338 | +            )
    | 339 | +
    | 340 | +        tokenizer = DistilBertTokenizer.from_preset(preset)
    | 341 | +
    | 342 | +        # Use the model's `max_sequence_length` if `sequence_length` is
    | 343 | +        # unspecified; otherwise check that it is not too long.
    | 344 | +        metadata = cls.presets[preset]
    | 345 | +        max_sequence_length = metadata["config"]["max_sequence_length"]
    | 346 | +        if sequence_length is not None:
    | 347 | +            if sequence_length > max_sequence_length:
    | 348 | +                raise ValueError(
    | 349 | +                    f"`sequence_length` cannot be longer than `{preset}` "
    | 350 | +                    f"preset's `max_sequence_length` of "
    | 351 | +                    f"{max_sequence_length}. Received: {sequence_length}."
    | 352 | +                )
    | 353 | +        else:
    | 354 | +            sequence_length = max_sequence_length
    | 355 | +
    | 356 | +        return cls(
    | 357 | +            tokenizer=tokenizer,
    | 358 | +            sequence_length=sequence_length,
    | 359 | +            truncate=truncate,
    | 360 | +            **kwargs,
    | 361 | +        )
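For intuition on the two `truncate` modes documented in the docstring above, here is a toy re-implementation of the budget allocation. This is illustrative only; the actual packing is performed by `MultiSegmentPacker`, not by this code:

```python
def truncate_round_robin(segments, budget):
    # Hand out tokens one at a time, cycling over segments that still
    # have tokens left, until `budget` tokens have been kept.
    kept = [[] for _ in segments]
    i = 0
    while budget > 0 and any(len(k) < len(s) for k, s in zip(kept, segments)):
        if len(kept[i]) < len(segments[i]):
            kept[i].append(segments[i][len(kept[i])])
            budget -= 1
        i = (i + 1) % len(segments)
    return kept


def truncate_waterfall(segments, budget):
    # Fill segments left to right: the first segment takes as much of
    # the budget as it needs, and the remainder flows to the next.
    kept = []
    for seg in segments:
        take = min(len(seg), budget)
        kept.append(seg[:take])
        budget -= take
    return kept


a, b = [1, 2, 3, 4], [5, 6, 7, 8]
print(truncate_round_robin([a, b], 5))  # [[1, 2, 3], [5, 6]]
print(truncate_waterfall([a, b], 5))    # [[1, 2, 3, 4], [5]]
```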
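The diff reads four keys off each preset entry: `config["max_sequence_length"]`, `preprocessor_config`, `vocabulary_url`, and `vocabulary_hash`. A hypothetical shape for one `backbone_presets` entry consistent with those accesses; the real values live in `keras_nlp/models/distilbert/distilbert_presets.py`:

```python
# Hypothetical entry shape inferred from the keys accessed in this diff;
# URLs and hashes are elided rather than invented.
backbone_presets = {
    "distilbert_base_uncased_en": {
        "config": {
            # Backbone hyperparameters; 512 is the standard DistilBERT
            # position limit, shown here as an assumption.
            "max_sequence_length": 512,
            # ...
        },
        "preprocessor_config": {
            # Tokenizer kwargs other than the vocabulary, e.g.
            # lowercasing for an uncased model (assumed).
            "lowercase": True,
        },
        "vocabulary_url": "...",   # hosted vocab.txt (elided)
        "vocabulary_hash": "...",  # hash checked by get_file (elided)
    },
}
```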