wellcometrust
diff --git a/deep_reference_parser/io/__init__.py b/deep_reference_parser/io/__init__.py
Lines changed: 2 additions & 1 deletion
diff --git a/deep_reference_parser/io/io.py b/deep_reference_parser/io/io.py
Lines changed: 133 additions & 0 deletions
@@ -1 +1,2 @@
-from .io import read_jsonl, write_jsonl
+from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle,
+                 write_to_csv, write_tsv)
@@ -6,9 +6,74 @@
 """
 
 import json
+import pickle
+import csv
+import os
+import pandas as pd
 
 from ..logger import logger
 
+def _split_list_by_linebreaks(tokens):
+    """Cycle through a list of tokens (or labels) and split them into lists
+    based on the presence of Nones or more likely math.nan caused by converting
+    pd.DataFrame columns to lists.
+    """
+    out = []
+    tokens_gen = iter(tokens)
+    while True:
+        try:
+            token = next(tokens_gen)
+            if isinstance(token, str) and token:
+                out.append(token)
+            else:
+                yield out
+                out = []
+        except StopIteration:
+            if out:
+                yield out
+            break
+
+def load_tsv(filepath, split_char="\t"):
+    """
+    Load and return the data stored in the given path.
+
+    Expects data in the following format (tab separations).
+
+      References   o       o
+                   o       o
+               1   o       o
+               .   o       o
+                   o       o
+             WHO   title   b-r
+       treatment   title   i-r
+      guidelines   title   i-r
+             for   title   i-r
+            drug   title   i-r
+               -   title   i-r
+       resistant   title   i-r
+    tuberculosis   title   i-r
+               ,   title   i-r
+            2016   title   i-r
+
+
+
+    Args:
+        filepath (str): Path to the data.
+        split_char(str): Character to be used to split each line of the
+            document.
+
+    Returns:
+        a series of lists depending on the number of label columns provided in 
+        filepath.
+
+    """
+
+    df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
+    out = [list(_split_list_by_linebreaks(column)) for _, column in df.iteritems()]
+
+    logger.info("Loaded %s training examples", len(out[0]))
+
+    return tuple(out)
 
 def write_jsonl(input_data, output_file):
     """
@@ -61,3 +126,71 @@ def read_jsonl(input_file):
     logger.debug("Read %s lines from %s", len(out), input_file)
 
     return out
+
+
+def write_to_csv(filename, columns, rows):
+    """
+    Create a .csv file from data given as columns and rows
+
+    Args:
+        filename(str): Path and name of the .csv file, without csv extension
+        columns(list): Columns of the csv file (First row of the file)
+        rows: Data to write into the csv file, given per row
+    """
+
+    with open(filename, "w") as csvfile:
+        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
+        wr.writerow(columns)
+
+        for i, row in enumerate(rows):
+            wr.writerow(row)
+    logger.info("Wrote results to %s", filename)
+
+
+def write_pickle(input_data, output_file, path=None):
+    """
+    Write an object to pickle
+
+    Args:
+        input_data(dict): A dict to be written to json.
+        output_file(str): A filename or path to which the json will be saved.
+        path(str): A string which will be prepended onto `output_file` with
+            `os.path.join()`. Obviates the need for lengthy `os.path.join`
+            statements each time this function is called.
+    """
+
+    if path:
+
+        output_file = os.path.join(path, output_file)
+
+    with open(output_file, "wb") as fb:
+        pickle.dump(input_data, fb)
+
+
+def read_pickle(input_file, path=None):
+    """Create a list from a jsonl file
+
+    Args:
+        input_file(str): File to be loaded.
+        path(str): A string which will be prepended onto `input_file` with
+            `os.path.join()`. Obviates the need for lengthy `os.path.join`
+            statements each time this function is called.
+    """
+
+    if path:
+        input_file = os.path.join(path, input_file)
+
+    with open(input_file, "rb") as fb:
+        out = pickle.load(fb)
+
+    logger.debug("Read data from %s", input_file)
+
+    return out
+
+def write_tsv(token_label_pairs, output_path):
+    """
+    Write tsv files to disk
+    """
+    with open(output_path, "w") as fb:
+        writer = csv.writer(fb, delimiter="\t")
+        writer.writerows(token_label_pairs)
Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-from .io import read_jsonl, write_jsonl`
	`1`	`+from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle,`
	`2`	`+ write_to_csv, write_tsv)`