
Commit c8f68c4

yukatherin authored and Alex Wang committed
s2s decoder update (make more params active; add projection layer) (#384)
* quick fix for mt
* [veryminor] add mt_attention parameter to defaults.conf
* readme: mt->s2s
* [s2sdecoder] add bottleneck layer
* [s2sdecoder] make all parameters active
* clean beamsearch
* pull/384: address comments
* fix merge
* Add s2s_ prefix to opts; add projection documentation in code
* Rename s2s configs + refactor param dict construction; fix scheduled sampling bug; fix projected dim bug
* Redo decoder param construction
* Remove unneeded code
* Remove hasattr calls; fix bug when no attn
* Fix wrong param name
1 parent 743de33 commit c8f68c4

File tree

5 files changed (+68 lines, -58 lines)

config/defaults.conf

Lines changed: 10 additions & 3 deletions
@@ -222,12 +222,19 @@ classifier_loss_fn = "" // Classifier loss function. Used only in some speciali
 classifier_span_pooling = "x,y" // Span pooling type (for edge probing only).
     // Options: 'attn' or one of the 'combination' arguments accepted by AllenNLP's
     // EndpointSpanExtractor.
+
+s2s {
+    d_hid_dec = 1024 // The hidden size of the decoder in seq2seq tasks.
+    n_layers_dec = 1 // The number of decoder layers in seq2seq tasks.
+    target_embedding_dim = 300 // The size of target word embeddings in seq2seq tasks.
+    attention = "bilinear" // Attention used in s2s. Current implemented options are "bilinear" and "none".
+    output_proj_input_dim = 1024 // Dimension of bottleneck layer in s2s decoder output projection. If
+                                 // output_proj_input_dim == d_hid_dec, will not add projection.
+}
+
 edgeprobe_cnn_context = 0 // expanded context for edge probing via CNN.
     // 0 looks at only the current word, 1 adds +/-
     // words (kernel width 3), etc.
-d_hid_dec = 300 // The hidden size of the decoder in seq2seq tasks.
-n_layers_dec = 1 // The number of decoder layers in seq2seq tasks.
-mt_attention = "bilinear" // Attention used in s2s. Current implemented options are "bilinear" and "none".
 
 // Training
 eval_val_interval = 500 // Comparable to val_interval, used during train_for_eval. Can be set separately per task.
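
Why the bottleneck option matters, in rough numbers: with the default output_proj_input_dim == d_hid_dec (1024) no extra layer is added, but shrinking it roughly halves the output projection for a 20k target vocabulary. A back-of-the-envelope sketch, not repo code (the 20k vocabulary matches max_targ_v_size of the wmt17 tasks in src/tasks.py; proj_dim = 512 is an assumed setting):

# Parameter count of the decoder output projection, with and without the bottleneck.
d_hid_dec, proj_dim, vocab_size = 1024, 512, 20000
direct = d_hid_dec * vocab_size + vocab_size                                            # Linear(d_hid_dec, V)
bottleneck = (d_hid_dec * proj_dim + proj_dim) + (proj_dim * vocab_size + vocab_size)   # Linear(d_hid_dec, proj) + Linear(proj, V)
print(f"direct: {direct:,}, bottleneck: {bottleneck:,}")  # direct: 20,500,000, bottleneck: 10,784,800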

src/beamsearch.py renamed to src/generate_s2s.py

Lines changed: 0 additions & 2 deletions
@@ -6,8 +6,6 @@
 from . import bleu_scoring
 import numpy as np
 
-""" Beam search was confirmed to be WRONG. Use greedy search"""
-
 
 def _get_word(decoder_vocab, word_idx):
     return decoder_vocab._index_to_token['targets'][word_idx]
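
The removed docstring notes that beam search was found to be broken and that greedy search is used instead. For orientation, a minimal greedy-decoding sketch (illustrative only, not the repo's generation code; step_logits_fn is a hypothetical helper that runs one decoder step and returns vocabulary logits):

import torch

def greedy_decode(step_logits_fn, start_idx, end_idx, max_steps=30):
    """Take the argmax token at every step instead of keeping a beam."""
    out, prev = [], start_idx
    for _ in range(max_steps):
        prev = int(torch.argmax(step_logits_fn(prev)))  # greedy choice
        if prev == end_idx:
            break
        out.append(prev)
    return out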

src/models.py

Lines changed: 16 additions & 23 deletions
@@ -26,7 +26,6 @@
 from .utils import get_batch_utilization, get_elmo_mixing_weights
 from . import config
 from . import edge_probing
-#from . import beamsearch
 
 from .tasks import CCGTaggingTask, ClassificationTask, CoLATask, EdgeProbingTask, GroundedSWTask, \
     GroundedTask, LanguageModelingTask, MTTask, MultiNLIDiagnosticTask, PairClassificationTask, \
@@ -345,33 +344,37 @@ def build_module(task, model, d_sent, d_emb, vocab, embedder, args):
         module = edge_probing.EdgeClassifierModule(task, d_sent, task_params)
         setattr(model, '%s_mdl' % task.name, module)
     elif isinstance(task, (RedditSeq2SeqTask, Wiki103Seq2SeqTask)):
-        attention = args.mt_attention
-        log.info("using {} attention".format(attention))
+        log.info("using {} attention".format(args.s2s['attention']))
         decoder_params = Params({'input_dim': d_sent,
                                  'target_embedding_dim': 300,
+                                 'decoder_hidden_size': args.s2s['d_hid_dec'],
+                                 'output_proj_input_dim': args.s2s['output_proj_input_dim'],
                                  'max_decoding_steps': args.max_seq_len,
                                  'target_namespace': 'tokens',
-                                 'attention': attention,
+                                 'attention': args.s2s['attention'],
                                  'dropout': args.dropout,
                                  'scheduled_sampling_ratio': 0.0})
-        decoder = Seq2SeqDecoder.from_params(vocab, decoder_params)
+        decoder = Seq2SeqDecoder(vocab, **decoder_params)
         setattr(model, '%s_decoder' % task.name, decoder)
     elif isinstance(task, MTTask):
-        attention = args.mt_attention
-        log.info("using {} attention".format(attention))
+        log.info("using {} attention".format(args.s2s['attention']))
         decoder_params = Params({'input_dim': d_sent,
                                  'target_embedding_dim': 300,
-                                 'max_decoding_steps': 200,
+                                 'decoder_hidden_size': args.s2s['d_hid_dec'],
+                                 'output_proj_input_dim': args.s2s['output_proj_input_dim'],
+                                 'max_decoding_steps': args.max_seq_len,
                                  'target_namespace': task._label_namespace if hasattr(task, '_label_namespace') else 'targets',
-                                 'attention': attention,
+                                 'attention': args.s2s['attention'],
                                  'dropout': args.dropout,
                                  'scheduled_sampling_ratio': 0.0})
-        decoder = Seq2SeqDecoder.from_params(vocab, decoder_params)
+        decoder = Seq2SeqDecoder(vocab, **decoder_params)
         setattr(model, '%s_decoder' % task.name, decoder)
+
     elif isinstance(task, SequenceGenerationTask):
         decoder, hid2voc = build_decoder(task, d_sent, vocab, embedder, args)
         setattr(model, '%s_decoder' % task.name, decoder)
         setattr(model, '%s_hid2voc' % task.name, hid2voc)
+
     elif isinstance(task, (GroundedTask, GroundedSWTask)):
         task.img_encoder = CNNEncoder(model_name='resnet', path=task.path)
         pooler = build_image_sent_module(task, d_sent, task_params)
@@ -491,10 +494,10 @@ def build_decoder(task, d_inp, vocab, embedder, args):
     ''' Build a task specific decoder '''
     rnn = s2s_e.by_name('lstm').from_params(
         Params({'input_size': embedder.get_output_dim(),
-                'hidden_size': args.d_hid_dec,
-                'num_layers': args.n_layers_dec, 'bidirectional': False}))
+                'hidden_size': args.s2s['d_hid_dec'],
+                'num_layers': args.s2s['n_layers_dec'], 'bidirectional': False}))
     decoder = SentenceEncoder(vocab, embedder, 0, rnn)
-    hid2voc = nn.Linear(args.d_hid_dec, args.max_word_v_size)
+    hid2voc = nn.Linear(args.s2s['d_hid_dec'], args.max_word_v_size)
     return decoder, hid2voc
 
 
@@ -813,16 +816,6 @@ def _seq_gen_forward(self, batch, task, predict):
         out.update(decoder.forward(sent, sent_mask, batch['targs']))
         task.scorer1(out['loss'].item())
 
-        # Commented out for final run (still needs this for further debugging).
-        # We don't want to write predictions during training.
-        #if not self.training and not isinstance(task, Wiki103_Seq2Seq):
-        #    # bleu scoring
-        #    bleu_score, unk_ratio_macroavg = beamsearch.generate_and_compute_bleu(decoder, sent, sent_mask, batch['targs']['words'], preds_file_path=task.preds_file_path, task=task)
-        #    task.scorer2(bleu_score)
-        #    task.scorer3(unk_ratio_macroavg)
-
-        return out
-
         if 'targs' in batch:
             pass
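
To see how the new s2s options flow into the decoder, a condensed sketch of the construction path above (illustrative: the plain dict stands in for the AllenNLP Params object, and d_sent, args, vocab are assumed to be in scope as in build_module):

decoder_params = {
    'input_dim': d_sent,                                   # encoder output size
    'target_embedding_dim': 300,
    'decoder_hidden_size': args.s2s['d_hid_dec'],
    'output_proj_input_dim': args.s2s['output_proj_input_dim'],
    'max_decoding_steps': args.max_seq_len,
    'target_namespace': 'targets',
    'attention': args.s2s['attention'],
    'dropout': args.dropout,
    'scheduled_sampling_ratio': 0.0,
}
decoder = Seq2SeqDecoder(vocab, **decoder_params)          # plain kwargs: every key must match an __init__ argument, so misspelled or stale options fail fast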

src/seq2seq_decoder.py

Lines changed: 31 additions & 30 deletions
@@ -31,10 +31,14 @@ class Seq2SeqDecoder(Model):
     def __init__(self,
                  vocab: Vocabulary,
                  input_dim: int,
+                 decoder_hidden_size: int,
+                 max_decoding_steps: int,
+                 output_proj_input_dim: int,
                  target_namespace: str = "targets",
                  target_embedding_dim: int = None,
                  attention: str = "none",
                  dropout: float = 0.0,
+                 scheduled_sampling_ratio: float = 0.0,
                  ) -> None:
         super(Seq2SeqDecoder, self).__init__(vocab)
         self._max_decoding_steps = max_decoding_steps
@@ -50,26 +54,39 @@
         # Decoder output dim needs to be the same as the encoder output dim since we initialize the
         # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
         # we're using attention with ``DotProductSimilarity``, this is needed.
-        self._decoder_hidden_dim = input_dim
+        self._encoder_output_dim = input_dim
+        self._decoder_hidden_dim = decoder_hidden_size
+        if self._encoder_output_dim != self._decoder_hidden_dim:
+            self._projection_encoder_out = Linear(self._encoder_output_dim, self._decoder_hidden_dim)
+        else:
+            self._projection_encoder_out = lambda x: x
         self._decoder_output_dim = self._decoder_hidden_dim
-        # target_embedding_dim = target_embedding_dim #or self._source_embedder.get_output_dim()
+        self._output_proj_input_dim = output_proj_input_dim
         self._target_embedding_dim = target_embedding_dim
         self._target_embedder = Embedding(num_classes, self._target_embedding_dim)
 
-        self._sent_pooler = Pooler.from_params(input_dim, input_dim, False)
+        # Used to get an initial hidden state from the encoder states
+        self._sent_pooler = Pooler.from_params(d_inp=input_dim, d_proj=decoder_hidden_size, project=True)
 
         if attention == "bilinear":
-            self._decoder_attention = BilinearAttention(input_dim, input_dim)
+            self._decoder_attention = BilinearAttention(decoder_hidden_size, input_dim)
             # The output of attention, a weighted average over encoder outputs, will be
             # concatenated to the input vector of the decoder at each time step.
             self._decoder_input_dim = input_dim + target_embedding_dim
         elif attention == "none":
+            self._decoder_attention = None
            self._decoder_input_dim = target_embedding_dim
         else:
             raise Exception("attention not implemented {}".format(attention))
 
         self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_hidden_dim)
-        self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
+        # Allow for a bottleneck layer between encoder outputs and distribution over vocab
+        # The bottleneck layer consists of a linear transform and helps to reduce number of parameters
+        if self._output_proj_input_dim != self._decoder_output_dim:
+            self._projection_bottleneck = Linear(self._decoder_output_dim, self._output_proj_input_dim)
+        else:
+            self._projection_bottleneck = lambda x: x
+        self._output_projection_layer = Linear(self._output_proj_input_dim, num_classes)
         self._dropout = torch.nn.Dropout(p=dropout)
 
     def _initalize_hidden_context_states(self, encoder_outputs, encoder_outputs_mask):
@@ -80,10 +97,9 @@ def _initalize_hidden_context_states(self, encoder_outputs, encoder_outputs_mask
         encoder_outputs: torch.FloatTensor, [bs, T, h]
         encoder_outputs_mask: torch.LongTensor, [bs, T, 1]
         """
-        # very important - feel free to check it a third time
-        # idempotent / safe to run in place. encoder_outputs_mask should never
-        # change
-        if hasattr(self, "_decoder_attention") and self._decoder_attention:
+
+        if self._decoder_attention is not None:
+            encoder_outputs = self._projection_encoder_out(encoder_outputs)
             encoder_outputs.data.masked_fill_(1 - encoder_outputs_mask.byte().data, -float('inf'))
 
         decoder_hidden = encoder_outputs.new_zeros(encoder_outputs_mask.size(0), self._decoder_hidden_dim)
@@ -132,8 +148,10 @@ def forward(self,  # type: ignore
             decoder_hidden, decoder_context = self._decoder_cell(
                 decoder_input, (decoder_hidden, decoder_context))
 
+            # output projection
+            proj_input = self._projection_bottleneck(decoder_hidden)
             # (batch_size, num_classes)
-            output_projections = self._output_projection_layer(decoder_hidden)
+            output_projections = self._output_projection_layer(proj_input)
 
             # list of (batch_size, 1, num_classes)
             step_logit = output_projections.unsqueeze(1)
@@ -204,7 +222,7 @@ def _prepare_decode_step_input(
         # (batch_size, target_embedding_dim)
         embedded_input = self._target_embedder(input_indices)
 
-        if hasattr(self, "_decoder_attention") and self._decoder_attention:
+        if self._decoder_attention is not None:
             # encoder_outputs : (batch_size, input_sequence_length, encoder_output_dim)
             # Ensuring mask is also a FloatTensor. Or else the multiplication within attention will
             # complain.
@@ -221,9 +239,9 @@
             # (batch_size, input_sequence_length)
             input_weights = self._decoder_attention(
                 decoder_hidden_state, encoder_outputs, encoder_outputs_mask)
-            # (batch_size, encoder_output_dim)
+            # (batch_size, input_dim)
             attended_input = weighted_sum(encoder_outputs, input_weights)
-            # (batch_size, encoder_output_dim + target_embedding_dim)
+            # (batch_size, input_dim + target_embedding_dim)
             return torch.cat((attended_input, embedded_input), -1)
         else:
             return embedded_input
@@ -259,20 +277,3 @@ def _get_loss(logits: torch.LongTensor,
         relevant_mask = target_mask[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
         loss = sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask)
         return loss
-
-    @classmethod
-    def from_params(cls, vocab, params: Params) -> 'SimpleSeq2Seq':
-        input_dim = params.pop("input_dim")
-        max_decoding_steps = params.pop("max_decoding_steps")
-        target_namespace = params.pop("target_namespace", "targets")
-        target_embedding_dim = params.pop("target_embedding_dim")
-        attention = params.pop("attention", "none")
-        dropout = params.pop_float("dropout", 0.0)
-        params.assert_empty(cls.__name__)
-        return cls(vocab,
-                   input_dim=input_dim,
-                   target_embedding_dim=target_embedding_dim,
-                   max_decoding_steps=max_decoding_steps,
-                   target_namespace=target_namespace,
-                   attention=attention,
-                   dropout=dropout)
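
For readers skimming the diff, a minimal standalone PyTorch sketch of the projection/bottleneck pattern introduced above (hypothetical class name and assumed sizes; it is not the repo's class, just the same idea in isolation):

import torch
from torch import nn

class VocabProjection(nn.Module):
    """Project decoder states down to proj_dim before the (large) vocab projection,
    and skip the extra layer entirely when proj_dim == hidden_dim."""

    def __init__(self, hidden_dim: int, proj_dim: int, vocab_size: int):
        super().__init__()
        if proj_dim != hidden_dim:
            self.bottleneck = nn.Linear(hidden_dim, proj_dim)
        else:
            self.bottleneck = nn.Identity()  # no-op, mirrors the `lambda x: x` above
        self.output = nn.Linear(proj_dim, vocab_size)

    def forward(self, decoder_hidden: torch.Tensor) -> torch.Tensor:
        return self.output(self.bottleneck(decoder_hidden))

# (batch_size, vocab_size) logits from a (batch_size, hidden_dim) decoder state
logits = VocabProjection(1024, 512, 20000)(torch.randn(8, 1024))
print(logits.shape)  # torch.Size([8, 20000])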

src/tasks.py

Lines changed: 11 additions & 0 deletions
@@ -1773,6 +1773,17 @@ def get_metrics(self, reset=False):
         return {'perplexity': math.exp(avg_nll), 'bleu_score': 0, 'unk_ratio_macroavg': unk_ratio_macroavg}
 
 
+@register_task('wmt_debug', rel_path='wmt_debug/', max_targ_v_size=5000)
+class MTDebug(MTTask):
+    def __init__(self, path, max_seq_len, max_targ_v_size, name='wmt_debug'):
+        ''' Demo task for MT with 10k training examples.'''
+        super().__init__(path=path, max_seq_len=max_seq_len,
+                         max_targ_v_size=max_targ_v_size, name=name)
+        self.files_by_split = {"train": os.path.join(path, "train.txt"),
+                               "val": os.path.join(path, "valid.txt"),
+                               "test": os.path.join(path, "test.txt")}
+
+
 @register_task('wmt17_en_ru', rel_path='wmt17_en_ru/', max_targ_v_size=20000)
 class MTTaskEnRu(MTTask):
     def __init__(self, path, max_seq_len, max_targ_v_size, name='mt_en_ru'):
