Commit d8ec4df

Merge pull request #17 from wellcometrust/feature/ivyleavedtoadflax/refactor_load_tsv

Refactor load_tsv to cover multitask case

2 parents 085f0fb + 87c098c
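For orientation, a minimal sketch of the call shape this refactor enables (the variable names and file path here are illustrative, not part of the commit): load_tsv now reads the whole TSV with pandas and returns one list per column, so a multitask file with a token column and two label columns unpacks as:

```python
from deep_reference_parser import load_tsv

# Illustrative path; any TSV with one token column and N label columns works.
# load_tsv returns a tuple holding one list of sequences per column.
tokens, task_1_labels, task_2_labels = load_tsv("data/multitask_train.tsv")
```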

5 files changed: +139 −132 lines changed

deep_reference_parser/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -24,7 +24,6 @@
 from .reference_utils import (
     break_into_chunks,
     labels_to_prodigy,
-    load_data,
     load_tsv,
     prodigy_to_conll,
     prodigy_to_lists,

deep_reference_parser/reference_utils.py

Lines changed: 42 additions & 129 deletions
@@ -8,161 +8,74 @@
 import json
 import os
 import pickle
+import pandas as pd

 import spacy

 from .logger import logger


-def load_data(filepath):
+def split_list_by_linebreaks(tokens):
+    """Cycle through a list of tokens (or labels) and split them into lists
+    based on the presence of Nones or more likely math.nan caused by converting
+    pd.DataFrame columns to lists.
     """
-    Load and return the data stored in the given path.
-
-    Adapted from: https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing
-
-    The data is structured as follows:
-    * Each line contains four columns separated by a single space.
-    * Each word has been put on a separate line and there is an empty line
-      after each sentence.
-    * The first item on each line is a word, the second, third and fourth are
-      tags related to the word.
-
-    Example:
-
-    The sentence "L. Antonielli, Iprefetti dell' Italia napoleonica, Bologna
-    1983." is represented in the dataset as:
-
-    ```
-    L author b-secondary b-r
-    . author i-secondary i-r
-    Antonielli author i-secondary i-r
-    , author i-secondary i-r
-    Iprefetti title i-secondary i-r
-    dell title i-secondary i-r
-    ’ title i-secondary i-r
-    Italia title i-secondary i-r
-    napoleonica title i-secondary i-r
-    , title i-secondary i-r
-    Bologna publicationplace i-secondary i-r
-    1983 year e-secondary i-r
-    . year e-secondary e-r
-    ```
-
-    Args:
-        filepath (str): Path to the data.
-
-    Returns:
-        four lists: The first contains tokens, the next three contain
-        corresponding labels.
-
-    """
-
-    # Arrays to return
-    words = []
-    tags_1 = []
-    tags_2 = []
-    tags_3 = []
-
-    word = tags1 = tags2 = tags3 = []
-    with open(filepath, "r") as file:
-        for line in file:
-            # Do not take the first line into consideration
-
-            if "DOCSTART" not in line:
-                # Check if empty line
-
-                if line in ["\n", "\r\n"]:
-                    # Append line
-
-                    words.append(word)
-                    tags_1.append(tags1)
-                    tags_2.append(tags2)
-                    tags_3.append(tags3)
-
-                    # Reset
-                    word = []
-                    tags1 = []
-                    tags2 = []
-                    tags3 = []
-
-                else:
-                    # Split the line into words, tag #1
-                    w = line[:-1].split(" ")
-
-                    word.append(w[0])
-                    tags1.append(w[1])
-                    tags2.append(w[2])
-                    tags3.append(w[3])
-
-    logger.info("Loaded %s training examples", len(words))
-
-    return words, tags_1, tags_2, tags_3
-
+    out = []
+    tokens_gen = iter(tokens)
+    while True:
+        try:
+            token = next(tokens_gen)
+            if isinstance(token, str) and token:
+                out.append(token)
+            else:
+                yield out
+                out = []
+        except StopIteration:
+            if out:
+                yield out
+            break

 def load_tsv(filepath, split_char="\t"):
     """
     Load and return the data stored in the given path.

-    Adapted from: https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing
+    Expects data in the following format (tab separations).
+
+    References o o
+    o o
+    1 o o
+    . o o
+    o o
+    WHO title b-r
+    treatment title i-r
+    guidelines title i-r
+    for title i-r
+    drug title i-r
+    - title i-r
+    resistant title i-r
+    tuberculosis title i-r
+    , title i-r
+    2016 title i-r

-    NOTE: In the current implementation in deep_reference_parser, only one set
-    of tags is used. The others will be used in a later PR.

-    The data is structured as follows:
-    * Each line contains four columns separated by a single space.
-    * Each word has been put on a separate line and there is an empty line
-      after each sentence.
-    * The first item on each line is a word, the second, third and fourth are
-      tags related to the word.

     Args:
         filepath (str): Path to the data.
         split_char(str): Character to be used to split each line of the
             document.

     Returns:
-        two lists: The first contains tokens, the second contains corresponding
-        labels.
+        a series of lists depending on the number of label columns provided in
+        filepath.

     """

-    # Arrays to return
-    words = []
-    tags_1 = []
-
-    word = []
-    tags1 = []
-
-    with open(filepath, "r") as file:
-        for line in file:
-            # Check if empty line
-
-            if line in ["\n", "\r\n", "\t\n"]:
-                # Append line
-
-                words.append(word)
-                tags_1.append(tags1)
-
-                # Reset
-                word = []
-                tags1 = []
-
-            else:
-
-                # Split the line into words, tag #1
-
-                w = line[:-1].split(split_char)
-                word.append(w[0])
-
-                # If tags are passed, (for training) then also add
-
-                if len(w) == 2:
-
-                    tags1.append(w[1])
+    df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
+    out = [list(split_list_by_linebreaks(column)) for _, column in df.iteritems()]

-    logger.info("Loaded %s training examples", len(words))
+    logger.info("Loaded %s training examples", len(out[0]))

-    return words, tags_1
+    return tuple(out)


 def prodigy_to_conll(docs):
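For reference, a minimal sketch of how the new helper behaves once a pd.DataFrame column has been turned into a list (the sample data is illustrative; blank TSV rows surface as NaN because load_tsv reads with skip_blank_lines=False):

```python
import math

from deep_reference_parser.reference_utils import split_list_by_linebreaks

# NaN marks where a blank line sat in the TSV; the generator splits on it.
tokens = ["WHO", "treatment", math.nan, "Bulletin", "de"]

print(list(split_list_by_linebreaks(tokens)))
# [['WHO', 'treatment'], ['Bulletin', 'de']]
```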

tests/common.py

Lines changed: 1 addition & 0 deletions
@@ -13,3 +13,4 @@ def get_path(p):
 TEST_REFERENCES = get_path("test_data/test_references.txt")
 TEST_TSV_PREDICT = get_path("test_data/test_tsv_predict.tsv")
 TEST_TSV_TRAIN = get_path("test_data/test_tsv_train.tsv")
+TEST_LOAD_TSV = get_path("test_data/test_load_tsv.tsv")

tests/test_data/test_load_tsv.tsv

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+the i-r a
+focus i-r a
+in i-r a
+Daloa i-r a
+, i-r a
+Côte i-r a
+d’Ivoire]. i-r a
+
+Bulletin i-r a
+de i-r a
+la i-r a
+Société i-r a
+de i-r a
+Pathologie i-r a
+
+Exotique i-r a
+et i-r a
+

tests/test_reference_utils.py

Lines changed: 78 additions & 2 deletions
@@ -12,9 +12,10 @@
     prodigy_to_conll,
     write_tsv,
     yield_token_label_pairs,
+    split_list_by_linebreaks,
 )

-from .common import TEST_TSV_PREDICT, TEST_TSV_TRAIN
+from .common import TEST_TSV_PREDICT, TEST_TSV_TRAIN, TEST_LOAD_TSV


 def test_prodigy_to_conll():

@@ -75,6 +76,14 @@ def test_load_tsv_train():

     actual = load_tsv(TEST_TSV_TRAIN)

+    assert len(actual[0][0]) == len(expected[0][0])
+    assert len(actual[0][1]) == len(expected[0][1])
+    assert len(actual[0][2]) == len(expected[0][2])
+
+    assert len(actual[1][0]) == len(expected[1][0])
+    assert len(actual[1][1]) == len(expected[1][1])
+    assert len(actual[1][2]) == len(expected[1][2])
+
     assert actual == expected


@@ -109,13 +118,59 @@ def test_load_tsv_predict():
             ["Bulletin", "de", "la", "Société", "de", "Pathologie"],
             ["Exotique", "et"],
         ],
-        [[], [], [],],
     )

     actual = load_tsv(TEST_TSV_PREDICT)

     assert actual == expected

+def test_load_tsv_train_multiple_labels():
+    """
+    Text of TEST_TSV_TRAIN:
+
+    ```
+    the i-r
+    focus i-r
+    in i-r
+    Daloa i-r
+    , i-r
+    Côte i-r
+    d’Ivoire]. i-r
+
+    Bulletin i-r
+    de i-r
+    la i-r
+    Société i-r
+    de i-r
+    Pathologie i-r
+
+    Exotique i-r
+    et i-r
+    ```
+    """
+
+    expected = (
+        [
+            ["the", "focus", "in", "Daloa", ",", "Côte", "d’Ivoire]."],
+            ["Bulletin", "de", "la", "Société", "de", "Pathologie"],
+            ["Exotique", "et"],
+        ],
+        [
+            ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r", "i-r"],
+            ["i-r", "i-r", "i-r", "i-r", "i-r", "i-r"],
+            ["i-r", "i-r"],
+        ],
+        [
+            ["a", "a", "a", "a", "a", "a", "a"],
+            ["a", "a", "a", "a", "a", "a"],
+            ["a", "a"],
+        ],
+    )
+
+    actual = load_tsv(TEST_LOAD_TSV)
+
+    assert actual == expected
+

 def test_yield_toke_label_pairs():

@@ -197,3 +252,24 @@ def test_break_into_chunks():
     actual = break_into_chunks(before, max_words=2)

     assert expected == actual
+
+def test_split_list_by_linebreaks():
+
+    lst = ["a", "b", "c", None, "d"]
+    expected = [["a", "b", "c"], ["d"]]
+
+    actual = split_list_by_linebreaks(lst)
+
+def test_list_by_linebreaks_ending_in_None():
+
+    lst = ["a", "b", "c", float("nan"), "d", None]
+    expected = [["a", "b", "c"], ["d"]]
+
+    actual = split_list_by_linebreaks(lst)
+
+def test_list_by_linebreaks_starting_in_None():
+
+    lst = [None, "a", "b", "c", None, "d"]
+    expected = [["a", "b", "c"], ["d"]]
+
+    actual = split_list_by_linebreaks(lst)
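As a usage sketch grounded in test_load_tsv_train_multiple_labels (path relative to the tests directory; the printed values are taken from the expected tuple in that test):

```python
from deep_reference_parser import load_tsv

# The fixture has three columns, so load_tsv yields a 3-tuple:
# token sequences plus one sequence list per label column.
tokens, labels, extra_labels = load_tsv("test_data/test_load_tsv.tsv")

print(tokens[0])        # ['the', 'focus', 'in', 'Daloa', ',', 'Côte', 'd’Ivoire].']
print(labels[0])        # ['i-r', 'i-r', 'i-r', 'i-r', 'i-r', 'i-r', 'i-r']
print(extra_labels[0])  # ['a', 'a', 'a', 'a', 'a', 'a', 'a']
```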

0 commit comments