
Commit dc40e8d

Merge pull request #5 from wellcometrust/feature/ivyleavedtoadflax/prodigy_utilities
Add deep_reference_parser utilities
2 parents 67efc81 + 0855a5c commit dc40e8d

21 files changed (+2071 −6 lines)

README.md

Lines changed: 1 addition & 1 deletion

@@ -165,7 +165,7 @@ To train your own models you will need to define the model hyperparameters in a
 python -m deep_reference_parser train test.ini
 ```
 
-Data must be prepared in the following tab separated format (tsv). We may publish further tools in the future to assist in the preparation of data following annotation. In this case the data the data for reference span ddetection follows an IOBE schema.
+Data must be prepared in the following tab separated format (tsv). We use [prodi.gy](https://prodi.gy) for annotations. Some utilities to help with manual annotations and various format conversions are available in the [prodigy](./prodigy/) module. Data for reference span detection follows an IOBE schema.
 
 You must provide the train/test/validation data splits in this format in pre-prepared files that are defined in the config file.
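To make the tsv/IOBE description concrete, a hand-made sketch of what such a file might contain follows, with one token and its tab separated label per line. The `b-r`/`i-r`/`e-r`/`o` label names are an assumption for illustration, not taken from this diff:

```
References	o
1	o
.	o
Upson	b-r
,	i-r
M	i-r
.	i-r
(	i-r
2019	i-r
)	e-r
```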

deep_reference_parser/__main__.py

Lines changed: 3 additions & 3 deletions

@@ -9,12 +9,12 @@
 import plac
 import sys
 from wasabi import msg
-from deep_reference_parser.train import train
-from deep_reference_parser.predict import predict
+from .train import train
+from .predict import predict
 
 commands = {
-    "train": train,
     "predict": predict,
+    "train": train,
 }
 
 if len(sys.argv) == 1:

deep_reference_parser/io/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
from .io import read_jsonl, write_jsonl

deep_reference_parser/io/io.py

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
# coding: utf-8

"""
Utilities for loading and saving data from various formats
"""

import json

from ..logger import logger


def write_jsonl(input_data, output_file):
    """
    Write a list of dicts to jsonl (line delimited json)

    Output format will look like:

    ```
    {'a': 0}
    {'b': 1}
    {'c': 2}
    {'d': 3}
    ```

    Args:
        input_data(list or dict): Data to be written out to jsonl. If a dict
            is passed, its values are written out.
        output_file(str): Filename to which the jsonl will be saved.
    """

    with open(output_file, 'w') as fb:

        # If a dict was passed, convert it to a list of its values

        if isinstance(input_data, dict):
            input_data = [value for key, value in input_data.items()]

        # Write out to jsonl file

        logger.debug('Writing %s lines to %s', len(input_data), output_file)

        for i in input_data:
            json_ = json.dumps(i) + '\n'
            fb.write(json_)


def _yield_jsonl(file_name):
    for row in open(file_name, "r"):
        yield json.loads(row)


def read_jsonl(input_file):
    """Create a list from a jsonl file

    Args:
        input_file(str): File to be loaded.
    """

    out = list(_yield_jsonl(input_file))

    logger.debug('Read %s lines from %s', len(out), input_file)

    return out
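A minimal usage sketch of these helpers (the file name is invented for illustration):

```
from deep_reference_parser.io import read_jsonl, write_jsonl

# Round-trip a small list of dicts through a line delimited json file.
docs = [{"text": "Smith J. (2019)."}, {"text": "Jones A. (2018)."}]
write_jsonl(docs, "docs.jsonl")    # one json object per line

loaded = read_jsonl("docs.jsonl")  # back to a list of dicts
assert loaded == docs
```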
deep_reference_parser/prodigy/README.md

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
# Prodigy utilities

The `deep_reference_parser.prodigy` module contains a number of utility functions for working with annotations created in [prodi.gy](http://prodi.gy).

The individual functions can be accessed with the usual `import deep_reference_parser.prodigy` logic, but can also be accessed on the command line with:

```
$ python -m deep_reference_parser.prodigy
Using TensorFlow backend.

ℹ Available commands
annotate_numbered_refs, prodigy_to_tsv, reach_to_prodigy,
refs_to_token_annotations
```

|Name|Description|
|---|---|
|reach_to_prodigy|Converts a jsonl of reference sections output by Reach into a jsonl containing prodigy format documents.|
|annotate_numbered_refs|Takes numbered reference sections extracted by Reach, and roughly annotates the references by splitting the reference lines apart on the numbers.|
|prodigy_to_tsv|Converts a jsonl file of prodigy documents to a tab separated values (tsv) file where each token and its associated label occupy a line.|
|refs_to_token_annotations|Takes a jsonl of reference sections in prodigy format that have been manually annotated to the reference level, and converts the references into token level annotations based on the IOBE schema, saving a new file of prodigy documents to jsonl.|

Help for each of these commands can be obtained with the `--help` flag, e.g.:

```
$ python -m deep_reference_parser.prodigy prodigy_to_tsv --help
Using TensorFlow backend.
usage: deep_reference_parser prodigy_to_tsv [-h] input_file output_file

Convert token annotated jsonl to token annotated tsv ready for use in the
Rodrigues model.

positional arguments:
  input_file   Path to jsonl file containing prodigy docs.
  output_file  Path to output tsv file.

optional arguments:
  -h, --help   show this help message and exit
```
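For orientation, each prodigy format document referred to above is a json object with `text`, `tokens`, and `spans` fields (the field names match those used by the annotator code below; the values here are invented and the token list is truncated):

```
{
  "text": "\n1. Smith J. (2019). A title.",
  "tokens": [
    {"text": "1", "start": 1, "end": 2, "id": 0},
    {"text": ".", "start": 2, "end": 3, "id": 1},
    {"text": "Smith", "start": 4, "end": 9, "id": 2}
  ],
  "spans": [
    {"start": 4, "end": 29, "token_start": 2, "token_end": 10, "label": "BE"}
  ]
}
```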
deep_reference_parser/prodigy/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
from .spacy_doc_to_prodigy import SpacyDocToProdigy
from .reference_to_token_annotations import TokenTagger
deep_reference_parser/prodigy/__main__.py

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
# coding: utf8

"""
Modified from https://github.com/explosion/spaCy/blob/master/spacy/__main__.py
"""

if __name__ == "__main__":
    import plac
    import sys
    from wasabi import msg
    from .numbered_reference_annotator import annotate_numbered_references
    from .prodigy_to_tsv import prodigy_to_tsv
    from .reach_to_prodigy import reach_to_prodigy
    from .reference_to_token_annotations import reference_to_token_annotations

    commands = {
        "annotate_numbered_refs": annotate_numbered_references,
        "prodigy_to_tsv": prodigy_to_tsv,
        "reach_to_prodigy": reach_to_prodigy,
        "refs_to_token_annotations": reference_to_token_annotations,
    }

    if len(sys.argv) == 1:
        msg.info("Available commands", ", ".join(commands), exits=1)
    command = sys.argv.pop(1)
    sys.argv[0] = "deep_reference_parser %s" % command

    if command in commands:
        plac.call(commands[command], sys.argv[1:])
    else:
        available = "Available: {}".format(", ".join(commands))
        msg.fail("Unknown command: {}".format(command), available, exits=1)
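A hypothetical invocation of one of these subcommands (file names invented), which plac dispatches to the matching function with the two positional arguments:

```
$ python -m deep_reference_parser.prodigy annotate_numbered_refs refs.jsonl refs_annotated.jsonl
```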
deep_reference_parser/prodigy/numbered_reference_annotator.py

Lines changed: 149 additions & 0 deletions

@@ -0,0 +1,149 @@
#!/usr/bin/env python3
# coding: utf-8

import re

import plac

from ..io import read_jsonl, write_jsonl
from ..logger import logger

REGEX = r"\n{1,2}(?:(?:\s)|(?:\(|\[))?(?:\d{1,2})(?:(?:\.\)|\.\]|\]\n|\.|\s)|(?:\]|\)))(\s+)?(?:\n)?(?:\s+)?(?!Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"


class NumberedReferenceAnnotator:
    """
    Takes reference sections with numeric labelling scraped by Reach in prodigy
    format, and labels the references as spans by splitting them using regex.

    Note that you must identify numbered reference sections first. This can be
    done with a simple textcat model trained in prodigy.
    """

    def __init__(self):

        self.regex = r""

    def run(self, docs, regex=REGEX):

        self.regex = regex

        for doc in docs:

            spans = self.label_numbered_references(doc["text"], doc["tokens"])
            doc["spans"] = spans

            yield doc

    def label_numbered_references(self, text, tokens):

        # Search for numbered references using regex

        splits = list(re.finditer(self.regex, text))
        spans = []

        for index in range(0, len(splits) - 1):

            # Calculate the approximate start and end of the reference using
            # the character offsets returned by re.finditer.

            start = splits[index].end()
            end = splits[index + 1].start()

            # Calculate which is the closest token to the character offset
            # returned above.

            token_start = self._find_closest_token(tokens, start, "start")
            token_end = self._find_closest_token(tokens, end, "end")

            # To avoid the possibility of mismatches between the character
            # offset and the token offset, reset the character offsets
            # based on the token offsets.

            start = self._get_token_offset(tokens, token_start, "start")
            end = self._get_token_offset(tokens, token_end, "end")

            # Create dict and append

            span = {
                "start": start,
                "end": end,
                "token_start": token_start,
                "token_end": token_end,
                "label": "BE",
            }

            spans.append(span)

        return spans

    def _find_closest_token(self, tokens, char_offset, pos_string):
        """
        Find the id of the token whose start/end is closest to `char_offset`

        Args:
            tokens: A list of token dicts from a prodigy document.
            char_offset(int): A character offset relating to either the start
                or the end of a token.
            pos_string(str): One of ["start", "end"] denoting whether
                `char_offset` is the start or the end of a token.
        """
        token_map = self._token_start_mapper(tokens, pos_string)
        token_key = self._find_closest_number(token_map.keys(), char_offset)

        return token_map[token_key]

    def _get_token_offset(self, tokens, token_id, pos_string):
        """
        Return the character offset for the token with id == token_id
        """

        token_match = (token[pos_string] for token in tokens if token["id"] == token_id)

        return next(token_match, None)

    def _find_closest_number(self, numbers, number):
        """Find the closest match to `number` in a list of numbers"""

        return min(numbers, key=lambda x: abs(x - number))

    def _token_start_mapper(self, tokens, pos_string):
        """Map token ids by the token start/end position"""

        return {token[pos_string]: token["id"] for token in tokens}


@plac.annotations(
    input_file=(
        "Path to jsonl file containing numbered reference sections as docs.",
        "positional",
        None,
        str
    ),
    output_file=(
        "Path to output jsonl file containing prodigy docs with numbered references labelled.",
        "positional",
        None,
        str
    )
)
def annotate_numbered_references(input_file, output_file):
    """
    Takes reference sections with numeric labelling scraped by Reach in prodigy
    format, and labels the references as spans by splitting them using regex.
    """

    numbered_reference_sections = read_jsonl(input_file)

    logger.info("Loaded %s prodigy docs", len(numbered_reference_sections))

    nra = NumberedReferenceAnnotator()

    # run() returns a generator, so materialise it before writing out.
    docs = list(nra.run(numbered_reference_sections))

    write_jsonl(docs, output_file)

    logger.info("Wrote %s annotated references to %s", len(docs), output_file)
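As a rough sketch of what the splitting regex does, the snippet below runs it over an invented numbered reference section; like `label_numbered_references`, it treats the text between consecutive matches as one reference (and so does not capture the text after the final match):

```
import re

# REGEX copied from numbered_reference_annotator.py above
REGEX = r"\n{1,2}(?:(?:\s)|(?:\(|\[))?(?:\d{1,2})(?:(?:\.\)|\.\]|\]\n|\.|\s)|(?:\]|\)))(\s+)?(?:\n)?(?:\s+)?(?!Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"

# A toy numbered reference section (invented text)
text = (
    "\n1. Smith J. (2019). A title."
    "\n2. Jones A. (2018). Another title."
    "\n3. Brown K. (2017). A third title.\n"
)

matches = list(re.finditer(REGEX, text))

# Each match covers the numeric label introducing a reference, so the
# text between consecutive matches approximates one reference.
for first, second in zip(matches, matches[1:]):
    print(text[first.end():second.start()])

# Prints:
# Smith J. (2019). A title.
# Jones A. (2018). Another title.
```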
