
Commit c36b74e

HaokunLiu authored and sleepinyourhat committed

Fixing index problem & minor pytorch_transformers_interface cleanup (#916)

* update boundry func with offsets
* update tasks that use indexes
* remove outdated temporary fix

1 parent 10fb192 · commit c36b74e

File tree: 3 files changed, +107 −58 lines changed


jiant/preprocess.py

Lines changed: 0 additions & 7 deletions
@@ -619,13 +619,6 @@ def add_pytorch_transformers_vocab(vocab, tokenizer_name):

     vocab_size = len(tokenizer)
     # do not use tokenizer.vocab_size, it does not include newly added token
-    if tokenizer_name.startswith("roberta-"):
-        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
-            vocab_size -= 1
-        else:
-            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
-    # due to a quirk in huggingface's file, the last token of RobertaTokenizer is None, remove
-    # this when they fix the problem

     ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
     log.info("Added pytorch_transformers vocab (%s): %d tokens", tokenizer_name, len(ordered_vocab))
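
The comment kept above explains the `len(tokenizer)` choice: in pytorch_transformers, `tokenizer.vocab_size` reports only the base vocabulary, while `len(tokenizer)` also counts tokens added afterwards. A minimal sketch of that behavior, assuming pytorch_transformers is installed (the model name and added token are placeholders, not part of the commit):

```python
from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["<extra_token>"])  # hypothetical newly added token

print(tokenizer.vocab_size)  # base vocabulary only
print(len(tokenizer))        # base vocabulary + newly added tokens
# preprocess.py iterates over range(len(tokenizer)), so the vocab copied
# into AllenNLP includes any tokens added on top of the pretrained one.
```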

jiant/pytorch_transformers_interface/modules.py

Lines changed: 90 additions & 41 deletions (several paired removals/additions below differ only by trailing whitespace)
@@ -88,11 +88,11 @@ def parameter_setup(self, args):
     def correct_sent_indexing(self, sent):
         """ Correct id difference between pytorch_transformers and AllenNLP.
         The AllenNLP indexer adds'@@UNKNOWN@@' token as index 1, and '@@PADDING@@' as index 0
-
+
         args:
-            sent: batch dictionary, in which
+            sent: batch dictionary, in which
                 sent[self.tokenizer_required]: <long> [batch_size, var_seq_len] input token IDs
-
+
         returns:
             ids: <long> [bath_size, var_seq_len] corrected token IDs
             input_mask: <long> [bath_size, var_seq_len] mask of input sequence
@@ -185,42 +185,47 @@ def get_seg_ids(self, token_ids, input_mask):
         return seg_ids

     @staticmethod
-    def apply_boundary_tokens(s1, s2=None):
+    def apply_boundary_tokens(s1, s2=None, get_offset=False):
         """
         A function that appliese the appropriate EOS/SOS/SEP/CLS tokens to token sequence or
-        token sequence pair for most tasks.
+        token sequence pair for most tasks.
         This function should be implmented in subclasses.
-
+
         args:
             s1: list[str], tokens from sentence 1
             s2: list[str] (optional), tokens from sentence 2, used for pair embedding
-
+            get_offset: bool, returns offset if True
+
         returns
             s: list[str], token sequence with boundry tokens
+            offset_s1 (optional): int, index offset of s1
+            offset_s2 (optional): int, index offset of s2
         """
         raise NotImplementedError

     @staticmethod
-    def apply_lm_boundary_tokens(s1):
+    def apply_lm_boundary_tokens(s1, get_offset=False):
         """
         A function that appliese the appropriate EOS/SOS/SEP/CLS tokens to a token sequence for
         language modeling tasks.
         This function should be implmented in subclasses.
-
+
         args:
             s1: list[str], tokens from sentence
-
+            get_offset: bool, returns offset if True
+
         returns
             s: list[str], token sequence with boundry tokens
+            offset_s1 (optional): int, index offset of s1
         """
         raise NotImplementedError

     def forward(self, sent, task_name):
-        """ Run pytorch_transformers model and return output representation
+        """ Run pytorch_transformers model and return output representation
         This function should be implmented in subclasses.
-
+
         args:
-            sent: batch dictionary, in which
+            sent: batch dictionary, in which
                 sent[self.tokenizer_required]: <long> [batch_size, var_seq_len] input token IDs
             task_name: task_name string, this can used to implement different mixing scalars for
                 differnt tasks. See the TODO in parameter_setup for more details.
@@ -235,7 +240,7 @@ def get_pretrained_lm_head(self):
         weight to the input token embedding. In most cases, this module needs to work with
         output_mode as "top" or "none"
         This function should be implmented in subclasses.
-
+
         returns:
             lm_head: module [*, hidden_size] -> [*, vocab_size]
         """
@@ -265,12 +270,17 @@ def __init__(self, args):
         self.parameter_setup(args)

     @staticmethod
-    def apply_boundary_tokens(s1, s2=None):
+    def apply_boundary_tokens(s1, s2=None, get_offset=False):
         # BERT-style boundary token padding on string token sequences
         if s2:
-            return ["[CLS]"] + s1 + ["[SEP]"] + s2 + ["[SEP]"]
+            s = ["[CLS]"] + s1 + ["[SEP]"] + s2 + ["[SEP]"]
+            if get_offset:
+                return s, 1, len(s1) + 2
         else:
-            return ["[CLS]"] + s1 + ["[SEP]"]
+            s = ["[CLS]"] + s1 + ["[SEP]"]
+            if get_offset:
+                return s, 1
+        return s

     def forward(self, sent: Dict[str, torch.LongTensor], task_name: str = "") -> torch.FloatTensor:
         ids, input_mask = self.correct_sent_indexing(sent)
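
As a quick illustration of the new `get_offset` contract (not part of the diff; `BertEmbedderModule` and the import path are assumed from the file layout), the BERT variant now reports where s1 and s2 start inside the wrapped sequence:

```python
from jiant.pytorch_transformers_interface.modules import BertEmbedderModule  # assumed class name

# Single sentence: s1 starts right after [CLS], so its offset is 1.
BertEmbedderModule.apply_boundary_tokens(["a", "b"], get_offset=True)
# -> (["[CLS]", "a", "b", "[SEP]"], 1)

# Sentence pair: s2 starts after [CLS] + s1 + [SEP], i.e. at len(s1) + 2.
BertEmbedderModule.apply_boundary_tokens(["a", "b"], ["c"], get_offset=True)
# -> (["[CLS]", "a", "b", "[SEP]", "c", "[SEP]"], 1, 4)
```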
@@ -317,12 +327,17 @@ def __init__(self, args):
         self.parameter_setup(args)

     @staticmethod
-    def apply_boundary_tokens(s1, s2=None):
+    def apply_boundary_tokens(s1, s2=None, get_offset=False):
         # RoBERTa-style boundary token padding on string token sequences
         if s2:
-            return ["<s>"] + s1 + ["</s>", "</s>"] + s2 + ["</s>"]
+            s = ["<s>"] + s1 + ["</s>", "</s>"] + s2 + ["</s>"]
+            if get_offset:
+                return s, 1, len(s1) + 3
         else:
-            return ["<s>"] + s1 + ["</s>"]
+            s = ["<s>"] + s1 + ["</s>"]
+            if get_offset:
+                return s, 1
+        return s

     def forward(self, sent: Dict[str, torch.LongTensor], task_name: str = "") -> torch.FloatTensor:
         ids, input_mask = self.correct_sent_indexing(sent)
@@ -372,12 +387,17 @@ def __init__(self, args):
         self._SEG_ID_SEP = 3

     @staticmethod
-    def apply_boundary_tokens(s1, s2=None):
+    def apply_boundary_tokens(s1, s2=None, get_offset=False):
         # XLNet-style boundary token marking on string token sequences
         if s2:
-            return s1 + ["<sep>"] + s2 + ["<sep>", "<cls>"]
+            s = s1 + ["<sep>"] + s2 + ["<sep>", "<cls>"]
+            if get_offset:
+                return s, 0, len(s1) + 1
         else:
-            return s1 + ["<sep>", "<cls>"]
+            s = s1 + ["<sep>", "<cls>"]
+            if get_offset:
+                return s, 0
+        return s

     def forward(self, sent: Dict[str, torch.LongTensor], task_name: str = "") -> torch.FloatTensor:
         ids, input_mask = self.correct_sent_indexing(sent)
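
This is the case a fixed offset gets wrong: XLNet appends its special tokens rather than prepending them, so s1 starts at offset 0 (illustration only; `XLNetEmbedderModule` is the assumed class name):

```python
XLNetEmbedderModule.apply_boundary_tokens(["a", "b"], get_offset=True)
# -> (["a", "b", "<sep>", "<cls>"], 0)

XLNetEmbedderModule.apply_boundary_tokens(["a", "b"], ["c"], get_offset=True)
# -> (["a", "b", "<sep>", "c", "<sep>", "<cls>"], 0, 3)
```

Any caller that shifted span indices by a hard-coded 1 was therefore off by one for XLNet-style inputs, which is what the tasks.py changes below correct.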
@@ -425,17 +445,25 @@ def __init__(self, args):
         self.parameter_setup(args)

     @staticmethod
-    def apply_boundary_tokens(s1, s2=None):
+    def apply_boundary_tokens(s1, s2=None, get_offset=False):
         # OpenAI-GPT-style boundary token marking on string token sequences
         if s2:
-            return ["<start>"] + s1 + ["<delim>"] + s2 + ["<extract>"]
+            s = ["<start>"] + s1 + ["<delim>"] + s2 + ["<extract>"]
+            if get_offset:
+                return s, 1, len(s1) + 2
         else:
-            return ["<start>"] + s1 + ["<extract>"]
+            s = ["<start>"] + s1 + ["<extract>"]
+            if get_offset:
+                return s, 1
+        return s

     @staticmethod
-    def apply_lm_boundary_tokens(s1):
+    def apply_lm_boundary_tokens(s1, get_offset=False):
         # OpenAI-GPT-style boundary token marking on string token sequences for LM tasks
-        return ["\n</w>"] + s1 + ["\n</w>"]
+        s = ["\n</w>"] + s1 + ["\n</w>"]
+        if get_offset:
+            return s, 1
+        return s

     def forward(self, sent: Dict[str, torch.LongTensor], task_name: str = "") -> torch.FloatTensor:
         ids, input_mask = self.correct_sent_indexing(sent)
@@ -479,17 +507,25 @@ def __init__(self, args):
         self.parameter_setup(args)

     @staticmethod
-    def apply_boundary_tokens(s1, s2=None):
+    def apply_boundary_tokens(s1, s2=None, get_offset=False):
         # GPT-2-style boundary token marking on string token sequences
         if s2:
-            return ["<start>"] + s1 + ["<delim>"] + s2 + ["<extract>"]
+            s = ["<start>"] + s1 + ["<delim>"] + s2 + ["<extract>"]
+            if get_offset:
+                return s, 1, len(s1) + 2
         else:
-            return ["<start>"] + s1 + ["<extract>"]
+            s = ["<start>"] + s1 + ["<extract>"]
+            if get_offset:
+                return s, 1
+        return s

     @staticmethod
-    def apply_lm_boundary_tokens(s1):
+    def apply_lm_boundary_tokens(s1, get_offset=False):
         # GPT-2-style boundary token marking on string token sequences for LM tasks
-        return ["<|endoftext|>"] + s1 + ["<|endoftext|>"]
+        s = ["<|endoftext|>"] + s1 + ["<|endoftext|>"]
+        if get_offset:
+            return s, 1
+        return s

     def forward(self, sent: Dict[str, torch.LongTensor], task_name: str = "") -> torch.FloatTensor:
         ids, input_mask = self.correct_sent_indexing(sent)
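
The LM variants follow the same pattern, so language-modeling callers can also recover where the original tokens begin (illustration only; `GPT2EmbedderModule` is the assumed class name):

```python
GPT2EmbedderModule.apply_lm_boundary_tokens(["a", "b"], get_offset=True)
# -> (["<|endoftext|>", "a", "b", "<|endoftext|>"], 1)
```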
@@ -533,17 +569,25 @@ def __init__(self, args):
         self.parameter_setup(args)

     @staticmethod
-    def apply_boundary_tokens(s1, s2=None):
+    def apply_boundary_tokens(s1, s2=None, get_offset=False):
         # TransformerXL-style boundary token marking on string token sequences
         if s2:
-            return ["<start>"] + s1 + ["<delim>"] + s2 + ["<extract>"]
+            s = ["<start>"] + s1 + ["<delim>"] + s2 + ["<extract>"]
+            if get_offset:
+                return s, 1, len(s1) + 2
         else:
-            return ["<start>"] + s1 + ["<extract>"]
+            s = ["<start>"] + s1 + ["<extract>"]
+            if get_offset:
+                return s, 1
+        return s

     @staticmethod
-    def apply_lm_boundary_tokens(s1):
+    def apply_lm_boundary_tokens(s1, get_offset=False):
         # TransformerXL-style boundary token marking on string token sequences for LM tasks
-        return ["<\n>"] + s1 + ["<\n>"]
+        s = ["<\n>"] + s1 + ["<\n>"]
+        if get_offset:
+            return s, 1
+        return s

     def forward(self, sent: Dict[str, torch.LongTensor], task_name: str = "") -> torch.FloatTensor:
         ids, input_mask = self.correct_sent_indexing(sent)
@@ -592,12 +636,17 @@ def __init__(self, args):
         self.parameter_setup(args)

     @staticmethod
-    def apply_boundary_tokens(s1, s2=None):
+    def apply_boundary_tokens(s1, s2=None, get_offset=False):
         # XLM-style boundary token marking on string token sequences
         if s2:
-            return ["</s>"] + s1 + ["</s>"] + s2 + ["</s>"]
+            s = ["</s>"] + s1 + ["</s>"] + s2 + ["</s>"]
+            if get_offset:
+                return s, 1, len(s1) + 2
         else:
-            return ["</s>"] + s1 + ["</s>"]
+            s = ["</s>"] + s1 + ["</s>"]
+            if get_offset:
+                return s, 1, len(s1) + 1
+        return s

     def forward(self, sent: Dict[str, torch.LongTensor], task_name: str = "") -> torch.FloatTensor:
         ids, input_mask = self.correct_sent_indexing(sent)

jiant/tasks/tasks.py

Lines changed: 17 additions & 10 deletions
@@ -2449,7 +2449,7 @@ def _make_span_field(self, s, text_field, offset=1):
     def make_instance(self, record, idx, indexers, model_preprocessing_interface) -> Type[Instance]:
         """Convert a single record to an AllenNLP Instance."""
         tokens = record["text"].split()
-        tokens = model_preprocessing_interface.boundary_token_fn(tokens)
+        tokens, offset = model_preprocessing_interface.boundary_token_fn(tokens, get_offset=True)
         text_field = sentence_to_text_field(tokens, indexers)

         example = {}
@@ -2459,7 +2459,7 @@ def make_instance(self, record, idx, indexers, model_preprocessing_interface) ->

         for i in range(self.num_spans):
             example["span" + str(i + 1) + "s"] = ListField(
-                [self._make_span_field(record["target"]["span" + str(i + 1)], text_field, 1)]
+                [self._make_span_field(record["target"]["span" + str(i + 1)], text_field, offset)]
             )
         example["labels"] = LabelField(
             record["label"], label_namespace="labels", skip_indexing=True
@@ -2657,18 +2657,25 @@ def _make_instance(input1, input2, idxs1, idxs2, labels, idx):
         d["sent1_str"] = MetadataField(" ".join(input1))
         d["sent2_str"] = MetadataField(" ".join(input2))
         if model_preprocessing_interface.model_flags["uses_pair_embedding"]:
-            inp = model_preprocessing_interface.boundary_token_fn(input1, input2)
+            inp, offset1, offset2 = model_preprocessing_interface.boundary_token_fn(
+                input1, input2, get_offset=True
+            )
             d["inputs"] = sentence_to_text_field(inp, indexers)
-            idxs2 = (idxs2[0] + len(input1), idxs2[1] + len(input1))
         else:
-            d["input1"] = sentence_to_text_field(
-                model_preprocessing_interface.boundary_token_fn(input1), indexers
+            inp1, offset1 = model_preprocessing_interface.boundary_token_fn(
+                input1, get_offset=True
             )
-            d["input2"] = sentence_to_text_field(
-                model_preprocessing_interface.boundary_token_fn(input2), indexers
+            inp2, offset2 = model_preprocessing_interface.boundary_token_fn(
+                input2, get_offset=True
             )
-        d["idx1"] = ListField([NumericField(i) for i in range(idxs1[0], idxs1[1])])
-        d["idx2"] = ListField([NumericField(i) for i in range(idxs2[0], idxs2[1])])
+            d["input1"] = sentence_to_text_field(inp1, indexers)
+            d["input2"] = sentence_to_text_field(inp2, indexers)
+        d["idx1"] = ListField(
+            [NumericField(i) for i in range(idxs1[0] + offset1, idxs1[1] + offset1)]
+        )
+        d["idx2"] = ListField(
+            [NumericField(i) for i in range(idxs2[0] + offset2, idxs2[1] + offset2)]
+        )
         d["labels"] = LabelField(labels, label_namespace="labels", skip_indexing=True)
         d["idx"] = LabelField(idx, label_namespace="idxs_tags", skip_indexing=True)
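
This hunk is the index fix itself: the old code left idxs1 unshifted and shifted idxs2 only by len(input1), ignoring the boundary tokens inserted before and between the two sentences; the new code shifts both ranges by the offsets returned from boundary_token_fn. A rough worked example under BERT-style markers (values, class name, and import path are illustrative, not from the commit):

```python
from jiant.pytorch_transformers_interface.modules import BertEmbedderModule  # assumed class name

input1 = ["the", "dog", "barks"]
input2 = ["it", "is", "loud"]
idxs1 = (1, 2)  # span over input1: "dog"
idxs2 = (0, 1)  # span over input2: "it"

inp, offset1, offset2 = BertEmbedderModule.apply_boundary_tokens(
    input1, input2, get_offset=True
)
# inp     = ["[CLS]", "the", "dog", "barks", "[SEP]", "it", "is", "loud", "[SEP]"]
# offset1 = 1, offset2 = len(input1) + 2 = 5

print(inp[idxs1[0] + offset1])  # "dog"
print(inp[idxs2[0] + offset2])  # "it"
# The old indexing pointed at inp[1] = "the" and inp[0 + len(input1)] = inp[3]
# = "barks", i.e. both spans drifted once boundary tokens were added.
```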

0 commit comments