 import json
 import os
 import pickle
+import pandas as pd
 
 import spacy
 
 from .logger import logger
 
 
-def load_data(filepath):
+def split_list_by_linebreaks(tokens):
+    """Cycle through a list of tokens (or labels) and yield sub-lists,
+    splitting on Nones or, more likely, on the NaNs that appear when
+    pd.DataFrame columns containing blank lines are converted to lists.
     """
-    Load and return the data stored in the given path.
-
-    Adapted from: https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing
-
-    The data is structured as follows:
-    * Each line contains four columns separated by a single space.
-    * Each word has been put on a separate line and there is an empty line
-      after each sentence.
-    * The first item on each line is a word, the second, third and fourth are
-      tags related to the word.
-
-    Example:
-
-    The sentence "L. Antonielli, Iprefetti dell' Italia napoleonica, Bologna
-    1983." is represented in the dataset as:
-
-    ```
-    L author b-secondary b-r
-    . author i-secondary i-r
-    Antonielli author i-secondary i-r
-    , author i-secondary i-r
-    Iprefetti title i-secondary i-r
-    dell title i-secondary i-r
-    ’ title i-secondary i-r
-    Italia title i-secondary i-r
-    napoleonica title i-secondary i-r
-    , title i-secondary i-r
-    Bologna publicationplace i-secondary i-r
-    1983 year e-secondary i-r
-    . year e-secondary e-r
-    ```
-
-    Args:
-        filepath (str): Path to the data.
-
-    Returns:
-        four lists: The first contains tokens, the next three contain
-        corresponding labels.
-
-    """
-
-    # Arrays to return
-    words = []
-    tags_1 = []
-    tags_2 = []
-    tags_3 = []
-
-    word = tags1 = tags2 = tags3 = []
-    with open(filepath, "r") as file:
-        for line in file:
-            # Do not take the first line into consideration
-
-            if "DOCSTART" not in line:
-                # Check if empty line
-
-                if line in ["\n", "\r\n"]:
-                    # Append line
-
-                    words.append(word)
-                    tags_1.append(tags1)
-                    tags_2.append(tags2)
-                    tags_3.append(tags3)
-
-                    # Reset
-                    word = []
-                    tags1 = []
-                    tags2 = []
-                    tags3 = []
-
-                else:
-                    # Split the line into words, tag #1
-                    w = line[:-1].split(" ")
-
-                    word.append(w[0])
-                    tags1.append(w[1])
-                    tags2.append(w[2])
-                    tags3.append(w[3])
-
-    logger.info("Loaded %s training examples", len(words))
-
-    return words, tags_1, tags_2, tags_3
-
+    out = []
+    for token in tokens:
+        # Non-empty strings belong to the current group; anything else
+        # (None, or NaN from a blank line) closes the group.
+        if isinstance(token, str) and token:
+            out.append(token)
+        else:
+            yield out
+            out = []
+    # Flush the final group if the input did not end on a blank.
+    if out:
+        yield out
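A quick illustration of what the new generator yields (a minimal sketch, assuming `split_list_by_linebreaks` is in scope; the token values are hypothetical):

```python
import math

# A DataFrame column converted to a list: blank separator lines become NaN.
tokens = ["References", math.nan, "WHO", "treatment", "guidelines"]

print(list(split_list_by_linebreaks(tokens)))
# -> [['References'], ['WHO', 'treatment', 'guidelines']]
```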
 
 def load_tsv(filepath, split_char="\t"):
     """
     Load and return the data stored in the given path.
 
-    Adapted from: https://github.com/dhlab-epfl/LinkedBooksDeepReferenceParsing
+    Expects data in the following format (tab separations).
+
+    References   o       o
+                 o       o
+    1            o       o
+    .            o       o
+                 o       o
+    WHO          title   b-r
+    treatment    title   i-r
+    guidelines   title   i-r
+    for          title   i-r
+    drug         title   i-r
+    -            title   i-r
+    resistant    title   i-r
+    tuberculosis title   i-r
+    ,            title   i-r
+    2016         title   i-r
 
-    NOTE: In the current implementation in deep_reference_parser, only one set
-    of tags is used. The others will be used in a later PR.
 
-    The data is structured as follows:
-    * Each line contains four columns separated by a single space.
-    * Each word has been put on a separate line and there is an empty line
-      after each sentence.
-    * The first item on each line is a word, the second, third and fourth are
-      tags related to the word.
 
     Args:
         filepath (str): Path to the data.
         split_char (str): Character to be used to split each line of the
             document.
 
     Returns:
-        two lists: The first contains tokens, the second contains corresponding
-        labels.
+        a tuple of lists, one per column in filepath: the first contains
+        tokens and the remaining lists contain the corresponding labels.
 
     """
 
-    # Arrays to return
-    words = []
-    tags_1 = []
-
-    word = []
-    tags1 = []
-
-    with open(filepath, "r") as file:
-        for line in file:
-            # Check if empty line
-
-            if line in ["\n", "\r\n", "\t\n"]:
-                # Append line
-
-                words.append(word)
-                tags_1.append(tags1)
-
-                # Reset
-                word = []
-                tags1 = []
-
-            else:
-
-                # Split the line into words, tag #1
-
-                w = line[:-1].split(split_char)
-                word.append(w[0])
-
-                # If tags are passed, (for training) then also add
-
-                if len(w) == 2:
-
-                    tags1.append(w[1])
+    # Blank separator lines are kept as NaN rows so that each column can be
+    # split back into per-reference groups by split_list_by_linebreaks.
+    df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
+    out = [list(split_list_by_linebreaks(column)) for _, column in df.items()]
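To make the intermediate step concrete, a small self-contained sketch of what `pd.read_csv` produces here (the inline TSV snippet is hypothetical):

```python
import io

import pandas as pd

# Two tokens and a blank-token separator row, in the expected TSV layout.
tsv = "References\to\to\n\to\to\n1\to\to\n"
df = pd.read_csv(io.StringIO(tsv), delimiter="\t", header=None, skip_blank_lines=False)

print(df[0].tolist())  # ['References', nan, '1'] -- the NaN marks a group break
```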
 
-    logger.info("Loaded %s training examples", len(words))
+    logger.info("Loaded %s training examples", len(out[0]))
 
-    return words, tags_1
+    return tuple(out)
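And a minimal usage sketch of the rewritten `load_tsv` (the path and the three-column layout are assumptions, mirroring the docstring example):

```python
# One token column plus two label columns unpacks into three lists of lists;
# each column is split independently wherever it contains a NaN entry.
tokens, labels_1, labels_2 = load_tsv("data/train.tsv")

print(len(tokens))  # number of blank-separated groups in the token column
print(tokens[0])    # e.g. ['References']
```

Compared with the hand-rolled line parser it replaces, this version handles any number of label columns with no extra code.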
 
 
 def prodigy_to_conll(docs):