Commit ae2fcf2

Merge pull request #19 from wellcometrust/reorganise
Reorganise sub-module structure
2 parents d8ec4df + 2e7da2e

15 files changed, +504 -622 lines

deep_reference_parser/__init__.py

Lines changed: 6 additions & 10 deletions
@@ -2,9 +2,9 @@
 # distracting on the command line. These lines here (while undesireable)
 # reduce the level of verbosity.
 
+import os
 import sys
 import warnings
-import os
 
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 
@@ -19,20 +19,16 @@
 
 from .common import download_model_artefact
 from .deep_reference_parser import DeepReferenceParser
-from .logger import logger
-from .model_utils import get_config
-from .reference_utils import (
-    break_into_chunks,
-    labels_to_prodigy,
+from .io import (
     load_tsv,
-    prodigy_to_conll,
-    prodigy_to_lists,
     read_jsonl,
     read_pickle,
-    write_json,
     write_jsonl,
     write_pickle,
     write_to_csv,
-    write_txt,
+    write_tsv,
 )
+from .logger import logger
+from .model_utils import get_config
+from .reference_utils import break_into_chunks
 from .tokens_to_references import tokens_to_references
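
Not part of the diff: a minimal sketch of the reorganised top-level import surface after this commit. The I/O helpers are now re-exported from the new deep_reference_parser.io module, and the TSV path below is a placeholder rather than a file shipped with the package.

    from deep_reference_parser import DeepReferenceParser, load_tsv, write_tsv
    from deep_reference_parser import break_into_chunks

    # "data/train.tsv" is a hypothetical path; load_tsv returns one list of
    # sentences per column found in the file.
    columns = load_tsv("data/train.tsv")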

deep_reference_parser/__version__.py

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 __name__ = "deep_reference_parser"
-__version__ = "2020.3.0"
+__version__ = "2020.3.1"
 __description__ = "Deep learning model for finding and parsing references"
 __url__ = "https://github.com/wellcometrust/deep_reference_parser"
 __author__ = "Wellcome Trust DataLabs Team"

deep_reference_parser/deep_reference_parser.py

Lines changed: 2 additions & 2 deletions
@@ -47,7 +47,7 @@
     save_confusion_matrix,
     word2vec_embeddings,
 )
-from .reference_utils import load_tsv, read_pickle, write_pickle, write_to_csv
+from .io import load_tsv, read_pickle, write_pickle, write_to_csv
 
 
 class DeepReferenceParser:
@@ -456,7 +456,7 @@ def build_model(
 
         self.model = model
 
-        logger.debug(self.model.summary(line_length=150))
+        # logger.debug(self.model.summary(line_length=150))
 
     def train_model(
         self, epochs=25, batch_size=100, early_stopping_patience=5, metric="val_f1"

deep_reference_parser/io/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1 +1,2 @@
-from .io import read_jsonl, write_jsonl
+from .io import (load_tsv, read_jsonl, read_pickle, write_jsonl, write_pickle,
+                 write_to_csv, write_tsv)

deep_reference_parser/io/io.py

Lines changed: 133 additions & 0 deletions
@@ -6,9 +6,74 @@
 """
 
 import json
+import pickle
+import csv
+import os
+import pandas as pd
 
 from ..logger import logger
 
+def _split_list_by_linebreaks(tokens):
+    """Cycle through a list of tokens (or labels) and split them into lists
+    based on the presence of Nones or more likely math.nan caused by converting
+    pd.DataFrame columns to lists.
+    """
+    out = []
+    tokens_gen = iter(tokens)
+    while True:
+        try:
+            token = next(tokens_gen)
+            if isinstance(token, str) and token:
+                out.append(token)
+            else:
+                yield out
+                out = []
+        except StopIteration:
+            if out:
+                yield out
+            break
+
+def load_tsv(filepath, split_char="\t"):
+    """
+    Load and return the data stored in the given path.
+
+    Expects data in the following format (tab separations).
+
+    References     o       o
+                   o       o
+    1              o       o
+    .              o       o
+                   o       o
+    WHO            title   b-r
+    treatment      title   i-r
+    guidelines     title   i-r
+    for            title   i-r
+    drug           title   i-r
+    -              title   i-r
+    resistant      title   i-r
+    tuberculosis   title   i-r
+    ,              title   i-r
+    2016           title   i-r
+
+
+
+    Args:
+        filepath (str): Path to the data.
+        split_char(str): Character to be used to split each line of the
+            document.
+
+    Returns:
+        a series of lists depending on the number of label columns provided in
+        filepath.
+
+    """
+
+    df = pd.read_csv(filepath, delimiter=split_char, header=None, skip_blank_lines=False)
+    out = [list(_split_list_by_linebreaks(column)) for _, column in df.iteritems()]
+
+    logger.info("Loaded %s training examples", len(out[0]))
+
+    return tuple(out)
 
 def write_jsonl(input_data, output_file):
     """
@@ -61,3 +126,71 @@ def read_jsonl(input_file):
     logger.debug("Read %s lines from %s", len(out), input_file)
 
     return out
+
+
+def write_to_csv(filename, columns, rows):
+    """
+    Create a .csv file from data given as columns and rows
+
+    Args:
+        filename(str): Path and name of the .csv file, without csv extension
+        columns(list): Columns of the csv file (First row of the file)
+        rows: Data to write into the csv file, given per row
+    """
+
+    with open(filename, "w") as csvfile:
+        wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
+        wr.writerow(columns)
+
+        for i, row in enumerate(rows):
+            wr.writerow(row)
+    logger.info("Wrote results to %s", filename)
+
+
+def write_pickle(input_data, output_file, path=None):
+    """
+    Write an object to pickle
+
+    Args:
+        input_data(dict): A dict to be written to json.
+        output_file(str): A filename or path to which the json will be saved.
+        path(str): A string which will be prepended onto `output_file` with
+            `os.path.join()`. Obviates the need for lengthy `os.path.join`
+            statements each time this function is called.
+    """
+
+    if path:
+
+        output_file = os.path.join(path, output_file)
+
+    with open(output_file, "wb") as fb:
+        pickle.dump(input_data, fb)
+
+
+def read_pickle(input_file, path=None):
+    """Create a list from a jsonl file
+
+    Args:
+        input_file(str): File to be loaded.
+        path(str): A string which will be prepended onto `input_file` with
+            `os.path.join()`. Obviates the need for lengthy `os.path.join`
+            statements each time this function is called.
+    """
+
+    if path:
+        input_file = os.path.join(path, input_file)
+
+    with open(input_file, "rb") as fb:
+        out = pickle.load(fb)
+
+    logger.debug("Read data from %s", input_file)
+
+    return out
+
+def write_tsv(token_label_pairs, output_path):
+    """
+    Write tsv files to disk
+    """
+    with open(output_path, "w") as fb:
+        writer = csv.writer(fb, delimiter="\t")
+        writer.writerows(token_label_pairs)
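
Not part of the commit: a minimal usage sketch of the new io helpers, assuming a hypothetical two-column token/label TSV at data/train.tsv.

    from deep_reference_parser.io import load_tsv, write_tsv, write_pickle, read_pickle

    # For a two-column file, load_tsv returns (token_lists, label_lists);
    # the path is a placeholder.
    tokens, labels = load_tsv("data/train.tsv")

    # write_tsv takes an iterable of rows and writes them tab separated.
    write_tsv([("WHO", "b-r"), ("treatment", "i-r")], "out.tsv")

    # write_pickle/read_pickle optionally join a directory onto the filename.
    write_pickle({"tokens": tokens}, "tokens.pickle", path="models")
    roundtrip = read_pickle("tokens.pickle", path="models")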

deep_reference_parser/prodigy/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -6,3 +6,5 @@
 from .reach_to_prodigy import ReachToProdigy, reach_to_prodigy
 from .reference_to_token_annotations import TokenTagger, reference_to_token_annotations
 from .spacy_doc_to_prodigy import SpacyDocToProdigy
+from .misc import prodigy_to_conll
+from .labels_to_prodigy import labels_to_prodigy
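
Not part of the diff: with this change the Prodigy conversion helpers are imported from the prodigy sub-package rather than from reference_utils, roughly:

    # previously: from deep_reference_parser.reference_utils import labels_to_prodigy, prodigy_to_conll
    from deep_reference_parser.prodigy import labels_to_prodigy, prodigy_to_conll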

deep_reference_parser/prodigy/labels_to_prodigy.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+def labels_to_prodigy(tokens, labels):
+    """
+    Converts a list of tokens and labels like those used by Rodrigues et al,
+    and converts to prodigy format dicts.
+
+    Args:
+        tokens (list): A list of tokens.
+        labels (list): A list of labels relating to `tokens`.
+
+    Returns:
+        A list of prodigy format dicts containing annotated data.
+    """
+
+    prodigy_data = []
+
+    all_token_index = 0
+
+    for line_index, line in enumerate(tokens):
+        prodigy_example = {}
+
+        tokens = []
+        spans = []
+        token_start_offset = 0
+
+        for token_index, token in enumerate(line):
+
+            token_end_offset = token_start_offset + len(token)
+
+            tokens.append(
+                {
+                    "text": token,
+                    "id": token_index,
+                    "start": token_start_offset,
+                    "end": token_end_offset,
+                }
+            )
+
+            spans.append(
+                {
+                    "label": labels[line_index][token_index : token_index + 1][0],
+                    "start": token_start_offset,
+                    "end": token_end_offset,
+                    "token_start": token_index,
+                    "token_end": token_index,
+                }
+            )
+
+            prodigy_example["text"] = " ".join(line)
+            prodigy_example["tokens"] = tokens
+            prodigy_example["spans"] = spans
+            prodigy_example["meta"] = {"line": line_index}
+
+            token_start_offset = token_end_offset + 1
+
+        prodigy_data.append(prodigy_example)
+
+    return prodigy_data
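
Not part of the commit: a small, hypothetical call to labels_to_prodigy showing the expected shape of its inputs and output.

    from deep_reference_parser.prodigy import labels_to_prodigy

    # One "line" of tokens with one label per token (illustrative values).
    tokens = [["WHO", "treatment", "guidelines"]]
    labels = [["b-r", "i-r", "i-r"]]

    examples = labels_to_prodigy(tokens, labels)
    # Each example carries "text", "tokens", "spans" and "meta" keys, e.g.
    # examples[0]["spans"][0] == {"label": "b-r", "start": 0, "end": 3, "token_start": 0, "token_end": 0}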

deep_reference_parser/prodigy/misc.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+import spacy
+
+
+def _join_prodigy_tokens(text):
+    """Return all prodigy tokens in a single string
+    """
+
+    return "\n".join([str(i) for i in text])
+
+
+def prodigy_to_conll(docs):
+    """
+    Expect list of jsons loaded from a jsonl
+    """
+
+    nlp = spacy.load("en_core_web_sm")
+    texts = [doc["text"] for doc in docs]
+    docs = list(nlp.tokenizer.pipe(texts))
+
+    out = [_join_prodigy_tokens(i) for i in docs]
+
+    out_str = "DOCSTART\n\n" + "\n\n".join(out)
+
+    return out_str
+
+
+def prodigy_to_lists(docs):
+    """
+    Expect list of jsons loaded from a jsonl
+    """
+
+    nlp = spacy.load("en_core_web_sm")
+    texts = [doc["text"] for doc in docs]
+    docs = list(nlp.tokenizer.pipe(texts))
+
+    out = [[str(token) for token in doc] for doc in docs]
+
+    return out
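
Not part of the commit: a minimal sketch of prodigy_to_conll. It needs the spaCy en_core_web_sm model installed, and the input dicts below are placeholders standing in for records read from a Prodigy .jsonl export.

    from deep_reference_parser.prodigy import prodigy_to_conll

    docs = [
        {"text": "WHO treatment guidelines for drug-resistant tuberculosis, 2016"},
        {"text": "References"},
    ]

    # Each document is tokenised with spaCy and emitted one token per line,
    # documents separated by blank lines after a leading DOCSTART marker.
    conll = prodigy_to_conll(docs)
    print(conll.splitlines()[:3])  # ['DOCSTART', '', 'WHO']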

deep_reference_parser/prodigy/prodigy_to_tsv.py

Lines changed: 1 addition & 3 deletions
@@ -20,7 +20,7 @@
 
 msg = Printer()
 
-ROWS_TO_PRINT=15
+ROWS_TO_PRINT = 15
 
 
 class TokenLabelPairs:
@@ -375,8 +375,6 @@ def prodigy_to_tsv(
 
     with open(output_file, "w") as fb:
        writer = csv.writer(fb, delimiter="\t")
-        # Write DOCSTART and a blank line
-        # writer.writerows([("DOCSTART", None), (None, None)])
         writer.writerows(merged_pairs)
 
     # Print out the first ten rows as a sense check
