Skip to content

Commit cda8323

Browse files
chg: Move prodigy functions to prodigy module
1 parent 6c74346 commit cda8323

File tree

4 files changed

+113
-1
lines changed

4 files changed

+113
-1
lines changed
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
2+
def labels_to_prodigy(tokens, labels):
    """Convert parallel token/label lists (Rodrigues et al format) to prodigy dicts.

    Character offsets are reconstructed by assuming the tokens of a line are
    joined with single spaces (matching the ``" ".join(line)`` text field).

    Args:
        tokens (list): A list of lines, each a list of token strings.
        labels (list): A list of label lists parallel to ``tokens``.

    Returns:
        list: One prodigy-format dict per line, each containing ``"text"``,
        ``"tokens"``, ``"spans"`` and ``"meta"`` keys.
    """
    prodigy_data = []

    for line_index, line in enumerate(tokens):
        # Renamed from the original's local `tokens`, which shadowed the
        # function parameter of the same name.
        token_dicts = []
        span_dicts = []
        token_start_offset = 0

        for token_index, token in enumerate(line):
            token_end_offset = token_start_offset + len(token)

            token_dicts.append(
                {
                    "text": token,
                    "id": token_index,
                    "start": token_start_offset,
                    "end": token_end_offset,
                }
            )

            span_dicts.append(
                {
                    # Plain index; the original's `[i:i+1][0]` slice was an
                    # obfuscated spelling of the same lookup.
                    "label": labels[line_index][token_index],
                    "start": token_start_offset,
                    "end": token_end_offset,
                    "token_start": token_index,
                    "token_end": token_index,
                }
            )

            # +1 accounts for the single space inserted by " ".join below.
            token_start_offset = token_end_offset + 1

        # Assemble the example once per line. The original re-assigned these
        # four keys on every token iteration (redundant work) and emitted a
        # bare {} for an empty line; an empty line now yields a well-formed
        # example with empty token/span lists.
        prodigy_data.append(
            {
                "text": " ".join(line),
                "tokens": token_dicts,
                "spans": span_dicts,
                "meta": {"line": line_index},
            }
        )

    return prodigy_data

deep_reference_parser/prodigy/misc.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import spacy
2+
3+
def _join_prodigy_tokens(text):
4+
"""Return all prodigy tokens in a single string
5+
"""
6+
7+
return "\n".join([str(i) for i in text])
8+
9+
def prodigy_to_conll(docs):
    """Render prodigy-style dicts as a CoNLL-like string.

    Expects a list of dicts loaded from a jsonl file, each carrying a
    "text" key. Each doc is tokenised and its tokens joined one-per-line;
    docs are separated by blank lines under a leading DOCSTART marker.
    """
    nlp = spacy.load("en_core_web_sm")

    tokenised = nlp.tokenizer.pipe(doc["text"] for doc in docs)
    blocks = [_join_prodigy_tokens(tokens) for tokens in tokenised]

    return "DOCSTART\n\n" + "\n\n".join(blocks)
23+
24+
25+
def prodigy_to_lists(docs):
    """Tokenise prodigy-style dicts into lists of token strings.

    Expects a list of dicts loaded from a jsonl file, each carrying a
    "text" key; returns one list of token strings per input doc.
    """
    nlp = spacy.load("en_core_web_sm")

    tokenised = nlp.tokenizer.pipe(doc["text"] for doc in docs)

    return [[str(token) for token in doc] for doc in tokenised]

tests/test_labels_to_prodigy.py renamed to tests/prodigy/test_labels_to_prodigy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python3
22
# coding: utf-8
33

4-
from deep_reference_parser.reference_utils import labels_to_prodigy
4+
from deep_reference_parser.prodigy import labels_to_prodigy
55

66

77
def test_labels_to_prodigy():

tests/prodigy/test_misc.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from deep_reference_parser.prodigy import prodigy_to_conll
2+
3+
def test_prodigy_to_conll():
    """Tokenised docs should be joined into one DOCSTART-prefixed string."""
    docs = [
        {"text": "References",},
        {"text": "37. No single case of malaria reported in"},
        {
            "text": "an essential requirement for the correct labelling of potency for therapeutic"
        },
        {"text": "EQAS, quality control for STI"},
    ]

    expected = "DOCSTART\n\nReferences\n\n37\n.\nNo\nsingle\ncase\nof\nmalaria\nreported\nin\n\nan\nessential\nrequirement\nfor\nthe\ncorrect\nlabelling\nof\npotency\nfor\ntherapeutic\n\nEQAS\n,\nquality\ncontrol\nfor\nSTI"

    assert prodigy_to_conll(docs) == expected

0 commit comments

Comments
 (0)