Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify get citation call for markup #240

Merged
merged 7 commits into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
The following changes are not yet released, but are code complete:

Features:
- None
- Introduced `Document` object to encapsulate plain text, markup text, span updates, tokens, and citation strings.
- Simplifies citation processing by reducing parameter passing and improving maintainability (hopefully).
- Should enable more complex HTML parsing.

Changes:
- None
- Moved text cleaning logic into `get_citations`, so callers passing markup no longer need to clean the text themselves
-

Fixes:
- Prefer the other full citation on overlap with nominative reporter
Expand Down
111 changes: 47 additions & 64 deletions eyecite/find.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import re
from bisect import bisect_left, bisect_right
from typing import List, Optional, Type, cast
from typing import Callable, Iterable, List, Optional, Type, Union, cast

from eyecite.annotate import SpanUpdater
from eyecite.helpers import (
disambiguate_reporters,
extract_pin_cite,
Expand All @@ -14,6 +13,7 @@
CaseReferenceToken,
CitationBase,
CitationToken,
Document,
FullCaseCitation,
FullCitation,
FullJournalCitation,
Expand All @@ -35,15 +35,16 @@


def get_citations(
plain_text: str,
plain_text: str = "",
remove_ambiguous: bool = False,
tokenizer: Tokenizer = default_tokenizer,
markup_text: str = "",
clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = None,
) -> List[CitationBase]:
"""This is eyecite's main workhorse function. Given a string of text
(e.g., a judicial opinion or other legal document), return a list of
(e.g., a judicial opinion or other legal doc), return a list of
`eyecite.models.CitationBase` objects representing the citations found
in the document.
in the doc.

Args:
plain_text: The text to parse. You may wish to use the
Expand All @@ -57,23 +58,22 @@ def get_citations(
markup_text: if the source text has markup (XML or HTML mostly), pass
it to extract ReferenceCitations that may be detectable via
markup style tags
clean_steps: Cleanup steps and methods

Returns:
A list of `eyecite.models.CitationBase` objects
"""
if plain_text == "eyecite":
return joke_cite

words, citation_tokens = tokenizer.tokenize(plain_text)
document = Document(
plain_text=plain_text,
markup_text=markup_text,
clean_steps=clean_steps,
)
document.tokenize(tokenizer=tokenizer)
citations: list[CitationBase] = []

if markup_text:
plain_to_markup = SpanUpdater(plain_text, markup_text)
markup_to_plain = SpanUpdater(markup_text, plain_text)
else:
plain_to_markup, markup_to_plain = None, None

for i, token in citation_tokens:
for i, token in document.citation_tokens:
citation: CitationBase
token_type = type(token)

Expand All @@ -84,36 +84,30 @@ def get_citations(
if token_type is CitationToken:
citation_token = cast(CitationToken, token)
if citation_token.short:
citation = _extract_shortform_citation(words, i)
citation = _extract_shortform_citation(document.words, i)
else:
citation = _extract_full_citation(words, i)
citation = _extract_full_citation(document.words, i)
if citations and isinstance(citation, FullCitation):
citation.is_parallel_citation(citations[-1])

# Check for reference citations that follow a full citation
# Using the plaintiff or defendant
references = extract_reference_citations(
citation,
plain_text,
markup_text,
plain_to_markup,
markup_to_plain,
)
citations.extend(references)
# Check for reference citations that follow a full citation
# Using the plaintiff or defendant
references = extract_reference_citations(citation, document)
citations.extend(references)

# CASE 2: Token is an "Id." or "Ibid." reference.
# In this case, the citation should simply be to the item cited
# immediately prior, but for safety we will leave that resolution up
# to the user.
elif token_type is IdToken:
citation = _extract_id_citation(words, i)
citation = _extract_id_citation(document.words, i)

# CASE 3: Token is a "supra" reference.
# In this case, we're not sure yet what the citation's antecedent is.
# It could be any of the previous citations above. Thus, like an Id.
# citation, for safety we won't resolve this reference yet.
elif token_type is SupraToken:
citation = _extract_supra_citation(words, i)
citation = _extract_supra_citation(document.words, i)

# CASE 4: Token is a section marker.
# In this case, it's likely that this is a reference to a citation,
Expand All @@ -137,48 +131,36 @@ def get_citations(
citations = disambiguate_reporters(citations)

# Returns a list of citations ordered in the sequence that they appear in
# the document. The ordering of this list is important for reconstructing
# the doc. The ordering of this list is important for reconstructing
# the references of the ShortCaseCitation, SupraCitation, and
# IdCitation and ReferenceCitation objects.
return citations


def extract_reference_citations(
citation: FullCitation,
plain_text: str,
markup_text: str = "",
plain_to_markup: Optional[SpanUpdater] = None,
markup_to_plain: Optional[SpanUpdater] = None,
citation: ResourceCitation, document: Document
) -> List[ReferenceCitation]:
"""Extract reference citations that follow a full citation

:param citation: the full case citation found
:param plain_text: the text
:param markup_text: optional argument for source text with XML style tags
that may help extracting name-only ReferenceCitations
:param plain_to_markup: a SpanUpdater from plain or clean text to
marked up text
:param markup_to_plain: a SpanUpdater from marked up text to plain text
:param document: document object to parse

:return: Reference citations
"""
if len(plain_text) <= citation.span()[-1]:
if len(document.plain_text) <= citation.span()[-1]:
return []
if not isinstance(citation, FullCaseCitation):
return []

reference_citations = extract_pincited_reference_citations(
citation, plain_text
citation, document.plain_text
)

if markup_text:
if document.markup_text:
reference_citations.extend(
find_reference_citations_from_markup(
markup_text,
plain_text,
document,
[citation],
plain_to_markup,
markup_to_plain,
)
)

Expand Down Expand Up @@ -392,11 +374,8 @@ def _extract_id_citation(


def find_reference_citations_from_markup(
markup_text: str,
plain_text: str,
document: Document,
citations: list,
plain_to_markup: Optional[SpanUpdater] = None,
markup_to_plain: Optional[SpanUpdater] = None,
) -> list[ReferenceCitation]:
"""Use HTML/XML style tags and parties names to find ReferenceCitations

Expand All @@ -420,11 +399,6 @@ def find_reference_citations_from_markup(

:return: a list of ReferenceCitations
"""
if not markup_to_plain:
markup_to_plain = SpanUpdater(markup_text, plain_text)
if not plain_to_markup:
plain_to_markup = SpanUpdater(plain_text, markup_text)

references = []
tags = "|".join(["em", "i"])

Expand Down Expand Up @@ -453,30 +427,39 @@ def find_reference_citations_from_markup(
# `utils.maybe_balance_style tags` for reference; it has some tolerance
# which may be enough for these citations
regex = rf"<(?:{tags})>\s*({'|'.join(regexes)})[:;.,\s]*</(?:{tags})>"
start_in_markup = plain_to_markup.update(

if (
not document.plain_to_markup
or not document.markup_to_plain
or not document.markup_text
):
# ensure we have markup text
return []
start_in_markup = document.plain_to_markup.update(
citation.span()[0], bisect_right
)
for match in re.finditer(regex, markup_text[start_in_markup:]):
full_start_in_plain = markup_to_plain.update(
for match in re.finditer(
regex, document.markup_text[start_in_markup:]
):
full_start_in_plain = document.markup_to_plain.update(
start_in_markup + match.start(), bisect_left
)
full_end_in_plain = markup_to_plain.update(
full_end_in_plain = document.markup_to_plain.update(
start_in_markup + match.end(), bisect_right
)

# the first group [match.group(0)] is the whole match,
# with whitespace and punctuation. the second group, match.group(1)
# is the only capturing and named group
start_in_plain = markup_to_plain.update(
start_in_plain = document.markup_to_plain.update(
start_in_markup + match.start(1), bisect_left
)
end_in_plain = markup_to_plain.update(
end_in_plain = document.markup_to_plain.update(
start_in_markup + match.end(1), bisect_right
)

reference = ReferenceCitation(
token=CaseReferenceToken(
data=plain_text[start_in_plain:end_in_plain],
data=document.plain_text[start_in_plain:end_in_plain],
start=start_in_plain,
end=end_in_plain,
),
Expand Down
40 changes: 40 additions & 0 deletions eyecite/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
Callable,
Dict,
Hashable,
Iterable,
List,
Optional,
Sequence,
Expand All @@ -15,6 +16,8 @@
cast,
)

from eyecite import clean_text
from eyecite.annotate import SpanUpdater
from eyecite.utils import REPORTERS_THAT_NEED_PAGE_CORRECTION, hash_sha256

ResourceType = Hashable
Expand Down Expand Up @@ -861,3 +864,40 @@ def __hash__(self):

def __eq__(self, other):
return self.__hash__() == other.__hash__()


@dataclass(eq=False, unsafe_hash=False)
class Document:
    """Hold a document's text together with the state derived from it.

    A document is created from plain text, from markup text, or from both.
    When markup text is supplied it takes precedence: ``plain_text`` is
    replaced by the result of running ``clean_steps`` over the markup, and
    two ``SpanUpdater`` objects are built between the plain and markup
    texts.

    Raises:
        ValueError: if ``markup_text`` is supplied but ``"html"`` is not one
            of ``clean_steps``.
    """

    # Text that gets tokenized; overwritten with the cleaned markup when
    # ``markup_text`` is supplied.
    plain_text: str = ""
    # Original source with markup; empty or None means "no markup".
    markup_text: Optional[str] = ""
    # Filled in by ``tokenize``: the second value returned by the tokenizer.
    citation_tokens: list[Tuple[int, Token]] = field(default_factory=list)
    # Filled in by ``tokenize``: the first value returned by the tokenizer.
    words: Tokens = field(default_factory=list)
    # SpanUpdater(plain_text, markup_text); stays None without markup.
    plain_to_markup: Optional[SpanUpdater] = field(default=None, init=False)
    # SpanUpdater(markup_text, plain_text); stays None without markup.
    markup_to_plain: Optional[SpanUpdater] = field(default=None, init=False)
    # Steps handed to ``clean_text``; normalized to a list in __post_init__.
    clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = field(
        default_factory=list
    )

    def __post_init__(self):
        # Materialize the steps once. Callers may pass None (that is the
        # default ``get_citations`` forwards) or a one-shot iterable, and the
        # steps are both inspected and applied below.
        steps = list(self.clean_steps) if self.clean_steps is not None else []
        self.clean_steps = steps

        if not self.markup_text:
            # Plain-text-only document: clean it only when there is
            # something to clean and something to clean it with.
            if self.plain_text and steps:
                self.plain_text = clean_text(self.plain_text, steps)
            return

        if "html" not in steps:
            raise ValueError(
                "`html` is a required cleanup step for markup text"
            )

        # Markup is the source of truth, so any plain text passed in is
        # replaced (there is no point cleaning it first).
        self.plain_text = clean_text(self.markup_text, steps)
        self.plain_to_markup = SpanUpdater(self.plain_text, self.markup_text)
        self.markup_to_plain = SpanUpdater(self.markup_text, self.plain_text)

    def tokenize(self, tokenizer):
        """Tokenize ``plain_text`` and store the result on this document.

        :param tokenizer: object whose ``tokenize(text)`` returns a
            ``(words, citation_tokens)`` pair
        """
        self.words, self.citation_tokens = tokenizer.tokenize(self.plain_text)
15 changes: 11 additions & 4 deletions tests/test_AnnotateTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from unittest import TestCase

from eyecite import annotate_citations, clean_text, get_citations
from eyecite.models import Document
from eyecite.utils import maybe_balance_style_tags


Expand Down Expand Up @@ -207,12 +208,18 @@ def lower_annotator(before, text, after):
clean_steps=clean_steps,
annotate_args=annotate_kwargs,
):
get_citations_args = {}
if annotate_kwargs.pop("use_markup", False):
get_citations_args = {"markup_text": source_text}
else:
get_citations_args = {"plain_text": source_text}

plain_text = clean_text(source_text, clean_steps)
cites = get_citations(plain_text, **get_citations_args)
document = Document(
**get_citations_args, clean_steps=clean_steps
)

cites = get_citations(
**get_citations_args, clean_steps=clean_steps
)
annotations = [
(c.span(), f"<{i}>", f"</{i}>")
for i, c in enumerate(cites)
Expand All @@ -225,7 +232,7 @@ def lower_annotator(before, text, after):
]

annotated = annotate_citations(
plain_text,
document.plain_text,
annotations,
source_text=source_text,
**annotate_kwargs,
Expand Down
Loading
Loading