
Commit 7f49660

Merge pull request #240 from freelawproject/simplify-method
Simplify get citation call for markup
2 parents: ba84b13 + ba78e8e

7 files changed: +142 -101 lines changed

CHANGES.md

+4
@@ -5,9 +5,13 @@
 The following changes are not yet released, but are code complete:
 
 Features:
+- Introduced `Document` object to encapsulate plain text, markup text, span updates, tokens, and citation strings.
+- Simplifies citation processing by reducing parameter passing and improving maintainability (hopefully).
+- Should enable more complex html parsing.
 - Adds support for years preceding citations
 
 Changes:
+- Moved text cleaning logic into `get_citations` for simpler call with markup
 - Simplifies is parallel logic
 - moves is parallel citation to full case citation
 
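In practice, the entries above boil down to a simpler call: instead of running `clean_text` yourself and passing both strings, you hand `get_citations` the raw markup plus the cleanup steps and it builds the `Document` internally. A minimal before/after sketch (the HTML snippet and variable names are illustrative, not taken from this commit):

from eyecite import clean_text, get_citations

html = "<p>See <em>Bush v. Gore</em>, 531 U.S. 98 (2000).</p>"  # made-up input

# Old style: clean first, then pass both the cleaned text and the markup.
plain = clean_text(html, ["html", "inline_whitespace"])
old_cites = get_citations(plain, markup_text=html)

# New style after this commit: one call, cleaning happens inside.
new_cites = get_citations(
    markup_text=html, clean_steps=["html", "inline_whitespace"]
)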

benchmark/benchmark.py

+9 -10

@@ -8,10 +8,11 @@
 import sys
 from io import StringIO
 from pathlib import Path
+from typing import Any, Dict
 
 from matplotlib import pyplot as plt  # type: ignore
 
-from eyecite import clean_text, get_citations
+from eyecite import get_citations
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.dirname(SCRIPT_DIR))
@@ -45,26 +46,24 @@ def generate_branch_report(self, branch: str) -> None:
         now = datetime.datetime.now()
         data = []
         for row in csv_data:
-            text = (
+            text: str = (
                 row["xml_harvard"]
                 or row["html_lawbox"]
                 or row["html_columbia"]
                 or row["html_anon_2020"]
                 or row["html"]
             )
+            params: Dict[str, Any] = {
+                "clean_steps": ["html", "inline_whitespace"]
+            }
             if text:
                 # Remove XML encodings from xml_harvard
                 text = re.sub(r"^<\?xml.*?\?>", "", text, count=1)
-                opinion_text_is_marked_up = True
+                params["markup_text"] = text or ""
             else:
-                text = row["plain_text"]
-                opinion_text_is_marked_up = False
+                params["markup_text"] = row["plain_text"]
 
-            plain_text = clean_text(text, ["html", "inline_whitespace"])
-            found_citations = get_citations(
-                plain_text,
-                markup_text=text if opinion_text_is_marked_up else "",
-            )
+            found_citations = get_citations(**params)
 
             # Get the citation text string from the cite object
             cites = [cite.token.data for cite in found_citations if cite.token]
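The benchmark then compares branches by the matched citation strings, pulled from each citation's underlying token as in the last line above. A small sketch of that pattern on plain text (the sample sentence is invented):

from eyecite import get_citations

sample = "See Bush v. Gore, 531 U.S. 98, 100 (2000). Id. at 99."  # invented

for cite in get_citations(sample):
    if cite.token:
        # token.data is the exact matched substring; span() gives its
        # character offsets in the text that was parsed.
        print(cite.token.data, cite.span())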

eyecite/find.py

+45 -66

@@ -1,8 +1,7 @@
 import re
 from bisect import bisect_left, bisect_right
-from typing import List, Optional, Type, cast
+from typing import Callable, Iterable, List, Optional, Type, Union, cast
 
-from eyecite.annotate import SpanUpdater
 from eyecite.helpers import (
     disambiguate_reporters,
     extract_pin_cite,
@@ -14,6 +13,7 @@
     CaseReferenceToken,
     CitationBase,
     CitationToken,
+    Document,
     FullCaseCitation,
     FullCitation,
     FullJournalCitation,
@@ -35,15 +35,16 @@
 
 
 def get_citations(
-    plain_text: str,
+    plain_text: str = "",
     remove_ambiguous: bool = False,
     tokenizer: Tokenizer = default_tokenizer,
     markup_text: str = "",
+    clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = None,
 ) -> List[CitationBase]:
     """This is eyecite's main workhorse function. Given a string of text
-    (e.g., a judicial opinion or other legal document), return a list of
+    (e.g., a judicial opinion or other legal doc), return a list of
     `eyecite.models.CitationBase` objects representing the citations found
-    in the document.
+    in the doc.
 
     Args:
         plain_text: The text to parse. You may wish to use the
@@ -57,23 +58,22 @@ def get_citations(
         markup_text: if the source text has markup (XML or HTML mostly), pass
             it to extract ReferenceCitations that may be detectable via
             markup style tags
+        clean_steps: Cleanup steps and methods
 
 
     Returns:
         A list of `eyecite.models.CitationBase` objects
     """
     if plain_text == "eyecite":
         return joke_cite
-    words, citation_tokens = tokenizer.tokenize(plain_text)
+    document = Document(
+        plain_text=plain_text,
+        markup_text=markup_text,
+        clean_steps=clean_steps,
+    )
+    document.tokenize(tokenizer=tokenizer)
     citations: list[CitationBase] = []
-
-    if markup_text:
-        plain_to_markup = SpanUpdater(plain_text, markup_text)
-        markup_to_plain = SpanUpdater(markup_text, plain_text)
-    else:
-        plain_to_markup, markup_to_plain = None, None
-
-    for i, token in citation_tokens:
+    for i, token in document.citation_tokens:
         citation: CitationBase
         token_type = type(token)
 
@@ -84,9 +84,9 @@ def get_citations(
         if token_type is CitationToken:
             citation_token = cast(CitationToken, token)
             if citation_token.short:
-                citation = _extract_shortform_citation(words, i)
+                citation = _extract_shortform_citation(document.words, i)
             else:
-                citation = _extract_full_citation(words, i)
+                citation = _extract_full_citation(document.words, i)
             if (
                 citations
                 and isinstance(citation, FullCaseCitation)
@@ -97,28 +97,22 @@ def get_citations(
 
                 # Check for reference citations that follow a full citation
                 # Using the plaintiff or defendant
-                references = extract_reference_citations(
-                    citation,
-                    plain_text,
-                    markup_text,
-                    plain_to_markup,
-                    markup_to_plain,
-                )
+                references = extract_reference_citations(citation, document)
                 citations.extend(references)
 
         # CASE 2: Token is an "Id." or "Ibid." reference.
        # In this case, the citation should simply be to the item cited
         # immediately prior, but for safety we will leave that resolution up
         # to the user.
         elif token_type is IdToken:
-            citation = _extract_id_citation(words, i)
+            citation = _extract_id_citation(document.words, i)
 
         # CASE 3: Token is a "supra" reference.
         # In this case, we're not sure yet what the citation's antecedent is.
         # It could be any of the previous citations above. Thus, like an Id.
         # citation, for safety we won't resolve this reference yet.
         elif token_type is SupraToken:
-            citation = _extract_supra_citation(words, i)
+            citation = _extract_supra_citation(document.words, i)
 
         # CASE 4: Token is a section marker.
         # In this case, it's likely that this is a reference to a citation,
@@ -142,48 +136,36 @@ def get_citations(
         citations = disambiguate_reporters(citations)
 
     # Returns a list of citations ordered in the sequence that they appear in
-    # the document. The ordering of this list is important for reconstructing
+    # the doc. The ordering of this list is important for reconstructing
     # the references of the ShortCaseCitation, SupraCitation, and
     # IdCitation and ReferenceCitation objects.
     return citations
 
 
 def extract_reference_citations(
-    citation: FullCitation,
-    plain_text: str,
-    markup_text: str = "",
-    plain_to_markup: Optional[SpanUpdater] = None,
-    markup_to_plain: Optional[SpanUpdater] = None,
+    citation: ResourceCitation, document: Document
 ) -> List[ReferenceCitation]:
     """Extract reference citations that follow a full citation
 
     :param citation: the full case citation found
-    :param plain_text: the text
-    :param markup_text: optional argument for source text with XML style tags
-        that may help extracting name-only ReferenceCitations
-    :param plain_to_markup: a SpanUpdater from plain or clean text to
-        marked up text
-    :param markup_to_plain: a SpanUpdater from marked up text to plain text
+    :param document: document object to parse
 
     :return: Reference citations
     """
-    if len(plain_text) <= citation.span()[-1]:
+    if len(document.plain_text) <= citation.span()[-1]:
         return []
     if not isinstance(citation, FullCaseCitation):
         return []
 
     reference_citations = extract_pincited_reference_citations(
-        citation, plain_text
+        citation, document.plain_text
     )
 
-    if markup_text:
+    if document.markup_text:
        reference_citations.extend(
             find_reference_citations_from_markup(
-                markup_text,
-                plain_text,
+                document,
                 [citation],
-                plain_to_markup,
-                markup_to_plain,
             )
         )
 
@@ -397,11 +379,8 @@ def _extract_id_citation(
 
 
 def find_reference_citations_from_markup(
-    markup_text: str,
-    plain_text: str,
+    document: Document,
     citations: list,
-    plain_to_markup: Optional[SpanUpdater] = None,
-    markup_to_plain: Optional[SpanUpdater] = None,
 ) -> list[ReferenceCitation]:
     """Use HTML/XML style tags and parties names to find ReferenceCitations
 
@@ -415,21 +394,12 @@ def find_reference_citations_from_markup(
     Creating the SpanUpdaters for each full citation will be too slow,
     re-use them if possible
 
-    :param markup_text: HTML or XML source
-    :param plain_text: cleaned text
+    :param document: Document object we are parsing
     :param citations: list of citations found over plain text. The full cites
         will be used to access parties names metadata
-    :param plain_to_markup: a SpanUpdater from plain or clean text to
-        marked up text
-    :param markup_to_plain: a SpanUpdater from marked up text to plain text
 
     :return: a list of ReferenceCitations
     """
-    if not markup_to_plain:
-        markup_to_plain = SpanUpdater(markup_text, plain_text)
-    if not plain_to_markup:
-        plain_to_markup = SpanUpdater(plain_text, markup_text)
-
     references = []
     tags = "|".join(["em", "i"])
 
@@ -458,30 +428,39 @@ def find_reference_citations_from_markup(
         # `utils.maybe_balance_style tags` for reference; it has some tolerance
         # which may be enough for these citations
         regex = rf"<(?:{tags})>\s*({'|'.join(regexes)})[:;.,\s]*</(?:{tags})>"
-        start_in_markup = plain_to_markup.update(
+
+        if (
+            not document.plain_to_markup
+            or not document.markup_to_plain
+            or not document.markup_text
+        ):
+            # ensure we have markup text
+            return []
+        start_in_markup = document.plain_to_markup.update(
             citation.span()[0], bisect_right
         )
-        for match in re.finditer(regex, markup_text[start_in_markup:]):
-            full_start_in_plain = markup_to_plain.update(
+        for match in re.finditer(
+            regex, document.markup_text[start_in_markup:]
+        ):
+            full_start_in_plain = document.markup_to_plain.update(
                 start_in_markup + match.start(), bisect_left
             )
-            full_end_in_plain = markup_to_plain.update(
+            full_end_in_plain = document.markup_to_plain.update(
                 start_in_markup + match.end(), bisect_right
             )
 
             # the first group [match.group(0)] is the whole match,
             # with whitespace and punctuation. the second group, match.group(1)
            # is the only capturing and named group
-            start_in_plain = markup_to_plain.update(
+            start_in_plain = document.markup_to_plain.update(
                 start_in_markup + match.start(1), bisect_left
             )
-            end_in_plain = markup_to_plain.update(
+            end_in_plain = document.markup_to_plain.update(
                 start_in_markup + match.end(1), bisect_right
             )
-
             reference = ReferenceCitation(
                 token=CaseReferenceToken(
-                    data=plain_text[start_in_plain:end_in_plain],
+                    data=document.plain_text[start_in_plain:end_in_plain],
                     start=start_in_plain,
                     end=end_in_plain,
                 ),
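With this refactor, callers of the public API never touch the `Document` or the `SpanUpdater` pair directly; passing `markup_text` along with an `html` clean step is enough for the markup-based reference pass to run. A rough end-to-end sketch (the opinion fragment is invented, and whether the second `<em>Bush</em>` actually comes back as a `ReferenceCitation` depends on the name-matching regexes built from the full citation):

from eyecite import get_citations
from eyecite.models import ReferenceCitation

markup = (
    "<p>In <em>Bush v. Gore</em>, 531 U.S. 98 (2000), the Court stayed the "
    "recount.</p><p>The <em>Bush</em> opinion was per curiam.</p>"
)

cites = get_citations(
    markup_text=markup, clean_steps=["html", "inline_whitespace"]
)

# Full citations and any markup-derived reference citations come back in
# document order; reference spans are over the cleaned plain text.
for cite in cites:
    if isinstance(cite, ReferenceCitation):
        print("reference:", cite.token.data if cite.token else None, cite.span())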

eyecite/models.py

+40
@@ -7,6 +7,7 @@
     Callable,
     Dict,
     Hashable,
+    Iterable,
     List,
     Optional,
     Sequence,
@@ -15,6 +16,8 @@
     cast,
 )
 
+from eyecite import clean_text
+from eyecite.annotate import SpanUpdater
 from eyecite.utils import REPORTERS_THAT_NEED_PAGE_CORRECTION, hash_sha256
 
 ResourceType = Hashable
@@ -859,3 +862,40 @@ def __hash__(self):
 
     def __eq__(self, other):
         return self.__hash__() == other.__hash__()
+
+
+@dataclass(eq=False, unsafe_hash=False)
+class Document:
+    plain_text: str = ""
+    markup_text: Optional[str] = ""
+    citation_tokens: list[Tuple[int, Token]] = field(default_factory=list)
+    words: Tokens = field(default_factory=list)
+    plain_to_markup: Optional[SpanUpdater] = field(default=None, init=False)
+    markup_to_plain: Optional[SpanUpdater] = field(default=None, init=False)
+    clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = field(
+        default_factory=list
+    )
+
+    def __post_init__(self):
+        if self.plain_text and self.clean_steps:
+            self.plain_text = clean_text(self.plain_text, self.clean_steps)
+
+        if self.markup_text != "":
+            if "html" not in self.clean_steps:
+                raise (
+                    "`html` is a required cleanup step for markup text",
+                    self.markup_text,
+                )
+
+            self.plain_text = clean_text(self.markup_text, self.clean_steps)
+
+            self.plain_to_markup = SpanUpdater(
+                self.plain_text, self.markup_text
+            )
+            self.markup_to_plain = SpanUpdater(
+                self.markup_text, self.plain_text
+            )
+
+    def tokenize(self, tokenizer):
+        # Tokenize the document and store the results in the document object
+        self.words, self.citation_tokens = tokenizer.tokenize(self.plain_text)
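The `Document` dataclass above can also be driven directly, which is what `get_citations` now does under the hood: cleaning happens in `__post_init__` (with `html` required in `clean_steps` when `markup_text` is set), the two `SpanUpdater`s are built there, and `tokenize()` fills `words` and `citation_tokens`. A minimal sketch, assuming `default_tokenizer` lives in `eyecite.tokenizers` (that import path is not shown in this diff) and using an invented markup string:

from eyecite.models import Document
from eyecite.tokenizers import default_tokenizer

doc = Document(
    markup_text="<p>See <em>Bush v. Gore</em>, 531 U.S. 98 (2000).</p>",
    clean_steps=["html", "inline_whitespace"],  # "html" is required for markup
)

# __post_init__ has already derived plain_text and both SpanUpdaters.
doc.tokenize(tokenizer=default_tokenizer)
print(doc.plain_text)
print(len(doc.citation_tokens), "citation token(s)")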
