|
6 | 6 | """
|
7 | 7 |
|
8 | 8 | import json
|
| 9 | +import pickle |
| 10 | +import csv |
| 11 | +import os |
| 12 | +import pandas as pd |
9 | 13 |
|
10 | 14 | from ..logger import logger
|
11 | 15 |
|
def _split_list_by_linebreaks(tokens):
    """Split a flat sequence of tokens (or labels) into consecutive groups.

    Every item that is a non-empty ``str`` is collected into the current
    group. Any other item (``None``, an empty string, or more likely the
    ``math.nan`` produced when a pd.DataFrame column is converted to a list)
    is treated as a separator that closes the current group.

    Args:
        tokens (iterable): Items to be grouped.

    Yields:
        list: The items collected since the previous separator. A separator
            always emits the current group, so two consecutive separators
            (or a leading separator) produce an empty list. A trailing group
            is only emitted when it is non-empty.
    """
    out = []
    for token in tokens:
        if isinstance(token, str) and token:
            out.append(token)
        else:
            # Separator reached: hand over what has been collected so far
            # (possibly nothing) and start a fresh group.
            yield out
            out = []

    # Input ended without a closing separator: flush the remaining items.
    if out:
        yield out
| 35 | + |
def load_tsv(filepath, split_char="\t"):
    """
    Load and return the data stored in the given path.

    Expects data in the following format (tab separations), where a blank
    line separates one example from the next:

        References      o       o
                        o       o
        1               o       o
        .               o       o
                        o       o
        WHO             title   b-r
        treatment       title   i-r
        guidelines      title   i-r
        for             title   i-r
        drug            title   i-r
        -               title   i-r
        resistant       title   i-r
        tuberculosis    title   i-r
        ,               title   i-r
        2016            title   i-r



    Args:
        filepath (str): Path to the data.
        split_char(str): Character to be used to split each line of the
            document.

    Returns:
        tuple: One entry per column found in filepath (tokens first, then
            each label column). Every entry is a list of lists, produced by
            splitting that column with `_split_list_by_linebreaks`.

    NOTE(review): pandas' default NA parsing also turns literal tokens such
    as "NA" or "null" into NaN, which this function then treats as a line
    break. Confirm whether `keep_default_na=False` is wanted here.
    """

    # Blank lines must be kept: they come back as NaN rows and are the
    # markers used to split each column into separate examples.
    df = pd.read_csv(
        filepath, delimiter=split_char, header=None, skip_blank_lines=False
    )

    # DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
    # (column label, Series) pairs and is available in older versions too.
    out = [list(_split_list_by_linebreaks(column)) for _, column in df.items()]

    logger.info("Loaded %s training examples", len(out[0]))

    return tuple(out)
12 | 77 |
|
13 | 78 | def write_jsonl(input_data, output_file):
|
14 | 79 | """
|
@@ -61,3 +126,71 @@ def read_jsonl(input_file):
|
61 | 126 | logger.debug("Read %s lines from %s", len(out), input_file)
|
62 | 127 |
|
63 | 128 | return out
|
| 129 | + |
| 130 | + |
def write_to_csv(filename, columns, rows):
    """
    Create a .csv file from data given as columns and rows

    Every field is quoted (``csv.QUOTE_ALL``).

    Args:
        filename(str): Path of the file to write. It is used exactly as
            given; no extension is appended.
        columns(list): Columns of the csv file (First row of the file)
        rows: Data to write into the csv file, given per row
    """

    # newline="" lets the csv module control line endings itself; without it
    # every row ends in "\r\r\n" on platforms that translate "\n" to "\r\n".
    with open(filename, "w", newline="") as csvfile:
        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        wr.writerow(columns)
        wr.writerows(rows)

    logger.info("Wrote results to %s", filename)
| 148 | + |
| 149 | + |
def write_pickle(input_data, output_file, path=None):
    """
    Serialise an object to disk with pickle

    Args:
        input_data: The object to be pickled.
        output_file(str): A filename or path to which the pickle will be
            saved.
        path(str): Optional directory which is prepended onto `output_file`
            with `os.path.join()`, so callers do not have to build the full
            path themselves. Ignored when empty or None.
    """

    destination = os.path.join(path, output_file) if path else output_file

    with open(destination, "wb") as handle:
        pickle.dump(input_data, handle)
| 168 | + |
| 169 | + |
def read_pickle(input_file, path=None):
    """Load and return the object stored in a pickle file

    Unpickling can execute arbitrary code, so this must only be used on
    files that come from a trusted source.

    Args:
        input_file(str): File to be loaded.
        path(str): Optional directory which is prepended onto `input_file`
            with `os.path.join()`, so callers do not have to build the full
            path themselves. Ignored when empty or None.

    Returns:
        The unpickled object.
    """

    source = os.path.join(path, input_file) if path else input_file

    with open(source, "rb") as handle:
        loaded = pickle.load(handle)

    logger.debug("Read data from %s", source)

    return loaded
| 189 | + |
def write_tsv(token_label_pairs, output_path):
    """
    Write tsv files to disk

    Args:
        token_label_pairs: Iterable of rows; each row is an iterable of
            fields (e.g. a token followed by its labels) that are written
            tab-separated on one line.
        output_path(str): Path of the file to write.
    """
    # newline="" is required by the csv module so that it alone decides the
    # line terminator (otherwise rows end in "\r\r\n" on Windows). UTF-8 is
    # set explicitly so the output does not depend on the locale and matches
    # the default encoding pandas uses when load_tsv reads the file back.
    with open(output_path, "w", newline="", encoding="utf-8") as fb:
        writer = csv.writer(fb, delimiter="\t")
        writer.writerows(token_label_pairs)
0 commit comments