
Commit 743de33

Authored by Alex Wang
Reset all BiLMs (ELMo and when using BiLM); move reset to sentence encoder (#393)
* Move reset elmo to sentence encoder
* Remove extra import
* Move reset state to sentenceencoder; reset biLM states also
* Remove reset elmo util function
* Remove reset elmo fn
* Add comment
* Dont use hasattr; remove check in trainer
1 parent ec9ba0f commit 743de33

File tree

5 files changed: +19 -18 lines changed

src/evaluate.py

Lines changed: 0 additions & 2 deletions

@@ -11,7 +11,6 @@
 from allennlp.data.iterators import BasicIterator
 from . import tasks as tasks_module
 from . import preprocess
-from . import utils

 from typing import List, Sequence, Iterable, Tuple, Dict

@@ -55,7 +54,6 @@ def evaluate(model, tasks: Sequence[tasks_module.Task], batch_size: int,
         dataset = getattr(task, "%s_data" % split)
         generator = iterator(dataset, num_epochs=1, shuffle=False, cuda_device=cuda_device)
         for batch_idx, batch in enumerate(generator):
-            utils.reset_elmo_states(model)
             out = model.forward(task, batch, predict=True)
             # We don't want mnli-diagnostic to affect the micro and macro average.
             # Accuracy of mnli-diagnostic is hardcoded to 0.

src/models.py

Lines changed: 1 addition & 0 deletions

@@ -652,6 +652,7 @@ def _positive_pair_sentence_forward(self, batch, task, predict):
         So rotating sent1/sent2 and pairing with sent2/sent1 is one way to obtain -ve pairs
         '''
         out = {}
+
         # embed the sentence
         sent1, mask1 = self.sent_encoder(batch['input1'], task)
         sent2, mask2 = self.sent_encoder(batch['input2'], task)

src/modules.py

Lines changed: 15 additions & 2 deletions

@@ -62,7 +62,7 @@ def forward(self, embs, mask):
         return None

 class SentenceEncoder(Model):
-    ''' Given a sequence of tokens, embed each token and pass thru an LSTM. '''
+    ''' Given a sequence of tokens, embed each token and pass thru a sequence encoder. '''
     # NOTE: Do not apply dropout to the input of this module. Will be applied internally.

     def __init__(self, vocab, text_field_embedder, num_highway_layers, phrase_layer,

@@ -95,18 +95,23 @@ def __init__(self, vocab, text_field_embedder, num_highway_layers, phrase_layer,

         initializer(self)

-    def forward(self, sent, task):
+    def forward(self, sent, task, reset=True):
         # pylint: disable=arguments-differ
         """
         Args:
             - sent (Dict[str, torch.LongTensor]): From a ``TextField``.
             - task (Task): Used by the _text_field_embedder to pick the correct output
                    ELMo representation.
+            - reset (Bool): if True, manually reset the states of the ELMo LSTMs present
+                   (if using BiLM or ELMo embeddings). Set False, if want to preserve statefulness.
         Returns:
             - sent_enc (torch.FloatTensor): (b_size, seq_len, d_emb)
                 the padded values in sent_enc are set to 0
             - sent_mask (torch.FloatTensor): (b_size, seq_len, d_emb); all 0/1s
         """
+        if reset:
+            self.reset_states()
+
         # Embeddings
         # Note: These highway modules are actually identity functions by default.

@@ -183,6 +188,14 @@ def forward(self, sent, task):
         sent_enc = sent_enc.masked_fill(pad_mask, 0)
         return sent_enc, sent_mask

+    def reset_states(self):
+        ''' Reset ELMo if present; reset BiLM (ELMoLSTM) states if present '''
+        if 'token_embedder_elmo' in [name for name, _ in self._text_field_embedder.named_children()] and \
+                '_elmo' in [name for name, _ in self._text_field_embedder.token_embedder_elmo.named_children()]:
+            self._text_field_embedder.token_embedder_elmo._elmo._elmo_lstm._elmo_lstm.reset_states()
+        if isinstance(self._phrase_layer, BiLMEncoder):
+            self._phrase_layer.reset_states()
+
 class BiLMEncoder(ElmoLstm):
     """Wrapper around BiLM to give it an interface to comply with SentEncoder
     See base class: ElmoLstm
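
With this commit the reset logic lives on the encoder itself: forward() clears any ELMo/BiLM state up front unless told otherwise. A minimal usage sketch of the new interface (the sent_encoder and batch names here are illustrative, borrowed from the call sites elsewhere in this diff):

# By default forward() calls reset_states() first (reset=True), so no
# hidden state carries over from the previous batch.
sent_enc, sent_mask = sent_encoder(batch['input1'], task)

# To keep the ELMo/BiLM LSTMs stateful across consecutive calls, e.g. when
# feeding chunks of one long sequence, opt out per call and reset manually
# at the boundary you care about:
sent_enc, sent_mask = sent_encoder(batch['input1'], task, reset=False)
sent_encoder.reset_states()  # clears ELMo states and BiLMEncoder states, if present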

src/trainer.py

Lines changed: 1 addition & 4 deletions

@@ -21,7 +21,7 @@
 from allennlp.training.learning_rate_schedulers import LearningRateScheduler  # pylint: disable=import-error
 from allennlp.training.optimizers import Optimizer  # pylint: disable=import-error

-from .utils import device_mapping, assert_for_log, reset_elmo_states  # pylint: disable=import-error
+from .utils import device_mapping, assert_for_log  # pylint: disable=import-error
 from .evaluate import evaluate
 from . import config

@@ -434,8 +434,6 @@ def clip_function(grad): return grad.clamp(-self._grad_clipping, self._grad_clipping)
                 n_batches_since_val += 1
                 total_batches_trained += 1
                 optimizer.zero_grad()
-                if self._model.elmo:
-                    assert_for_log(self._model.sent_encoder._text_field_embedder.token_embedder_elmo._elmo._elmo_lstm._elmo_lstm._states is None, "Found carried over ELMo states!")
                 output_dict = self._forward(batch, task=task, for_training=True)
                 assert_for_log("loss" in output_dict,
                                "Model must return a dict containing a 'loss' key")

@@ -730,7 +728,6 @@ def _forward(self, batch, for_training, task=None):
         ''' At one point this does something, now it doesn't really do anything '''
         tensor_batch = batch
         model_out = self._model.forward(task, tensor_batch)
-        reset_elmo_states(self._model)
         return model_out

     def _description_from_metrics(self, metrics):

src/utils.py

Lines changed: 2 additions & 10 deletions

@@ -16,18 +16,16 @@
 import numpy as np
 import torch
 from torch.autograd import Variable
-
-from allennlp.common.checks import ConfigurationError
-
-# Masked Multi headed self attention
 from torch.nn import Dropout, Linear
 from torch.nn import Parameter
 from torch.nn import init

+from allennlp.common.checks import ConfigurationError
 from allennlp.nn.util import last_dim_softmax
 from allennlp.modules.seq2seq_encoders.seq2seq_encoder import Seq2SeqEncoder
 from allennlp.common.params import Params

+
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@@ -38,12 +36,6 @@
 # a poor job of adding correct whitespace. Use unescape_xml() only.
 _MOSES_DETOKENIZER = MosesDetokenizer()

-def reset_elmo_states(model):
-    ''' Reset ELMo hidden states if ELMo is detected '''
-    if model.elmo:
-        model.sent_encoder._text_field_embedder.token_embedder_elmo._elmo._elmo_lstm._elmo_lstm.reset_states()
-    return
-
 def copy_iter(elems):
     '''Simple iterator yielding copies of elements.'''
     for elem in elems:
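
The deleted reset_elmo_states helper is subsumed by the reset_states method added in src/modules.py, which also covers BiLM states. A sketch of the equivalent call at an old call site (assuming the model exposes its encoder as sent_encoder, as elsewhere in this diff):

# Old (removed): utils.reset_elmo_states(model)  -- reset ELMo states only
# New: let the encoder reset everything it owns (ELMo and BiLM states).
model.sent_encoder.reset_states()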
