From 8e66ed1d06fb8fcad06b6b03247a17bc867820ba Mon Sep 17 00:00:00 2001 From: William Palin Date: Wed, 19 Mar 2025 13:30:06 -0400 Subject: [PATCH 1/6] feat(eyecite): Init Document object for better citation parsing With the introduction of markup parsing, handling multiple parameters became unwieldy. To address this, I added a `Document` object that encapsulates: - Plain and markup text - Span updates for text mapping - Tokenized words and extracted citation tokens This refactor should enable: - **More complex parsing**, using HTML - **Simplified logic** by reducing the number of parameters passed around Additionally, it simplifies the API call by letting a user pass in either markup or plain text, without needing to provide both. --- eyecite/find.py | 111 ++++++++++++++++--------------------- eyecite/models.py | 40 +++++++++++++ tests/test_AnnotateTest.py | 15 +++-- tests/test_FindTest.py | 46 ++++++++------- tests/test_ResolveTest.py | 8 ++- 5 files changed, 131 insertions(+), 89 deletions(-) diff --git a/eyecite/find.py b/eyecite/find.py index de322d60..04577753 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -1,8 +1,7 @@ import re from bisect import bisect_left, bisect_right -from typing import List, Optional, Type, cast +from typing import Callable, Iterable, List, Optional, Type, Union, cast -from eyecite.annotate import SpanUpdater from eyecite.helpers import ( disambiguate_reporters, extract_pin_cite, @@ -14,6 +13,7 @@ CaseReferenceToken, CitationBase, CitationToken, + Document, FullCaseCitation, FullCitation, FullJournalCitation, @@ -35,15 +35,16 @@ def get_citations( - plain_text: str, + plain_text: str = "", remove_ambiguous: bool = False, tokenizer: Tokenizer = default_tokenizer, markup_text: str = "", + clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = None, ) -> List[CitationBase]: """This is eyecite's main workhorse function. Given a string of text - (e.g., a judicial opinion or other legal document), return a list of + (e.g., a judicial opinion or other legal doc), return a list of `eyecite.models.CitationBase` objects representing the citations found - in the document. + in the doc. Args: plain_text: The text to parse. 
You may wish to use the @@ -57,6 +58,7 @@ def get_citations( markup_text: if the source text has markup (XML or HTML mostly), pass it to extract ReferenceCitations that may be detectable via markup style tags + clean_steps: Cleanup steps and methods Returns: A list of `eyecite.models.CitationBase` objects @@ -64,16 +66,14 @@ def get_citations( if plain_text == "eyecite": return joke_cite - words, citation_tokens = tokenizer.tokenize(plain_text) + document = Document( + plain_text=plain_text, + markup_text=markup_text, + clean_steps=clean_steps, + ) + document.tokenize(tokenizer=tokenizer) citations: list[CitationBase] = [] - - if markup_text: - plain_to_markup = SpanUpdater(plain_text, markup_text) - markup_to_plain = SpanUpdater(markup_text, plain_text) - else: - plain_to_markup, markup_to_plain = None, None - - for i, token in citation_tokens: + for i, token in document.citation_tokens: citation: CitationBase token_type = type(token) @@ -84,36 +84,30 @@ def get_citations( if token_type is CitationToken: citation_token = cast(CitationToken, token) if citation_token.short: - citation = _extract_shortform_citation(words, i) + citation = _extract_shortform_citation(document.words, i) else: - citation = _extract_full_citation(words, i) + citation = _extract_full_citation(document.words, i) if citations and isinstance(citation, FullCitation): citation.is_parallel_citation(citations[-1]) - # Check for reference citations that follow a full citation - # Using the plaintiff or defendant - references = extract_reference_citations( - citation, - plain_text, - markup_text, - plain_to_markup, - markup_to_plain, - ) - citations.extend(references) + # Check for reference citations that follow a full citation + # Using the plaintiff or defendant + references = extract_reference_citations(citation, document) + citations.extend(references) # CASE 2: Token is an "Id." or "Ibid." reference. # In this case, the citation should simply be to the item cited # immediately prior, but for safety we will leave that resolution up # to the user. elif token_type is IdToken: - citation = _extract_id_citation(words, i) + citation = _extract_id_citation(document.words, i) # CASE 3: Token is a "supra" reference. # In this case, we're not sure yet what the citation's antecedent is. # It could be any of the previous citations above. Thus, like an Id. # citation, for safety we won't resolve this reference yet. elif token_type is SupraToken: - citation = _extract_supra_citation(words, i) + citation = _extract_supra_citation(document.words, i) # CASE 4: Token is a section marker. # In this case, it's likely that this is a reference to a citation, @@ -137,48 +131,36 @@ def get_citations( citations = disambiguate_reporters(citations) # Returns a list of citations ordered in the sequence that they appear in - # the document. The ordering of this list is important for reconstructing + # the doc. The ordering of this list is important for reconstructing # the references of the ShortCaseCitation, SupraCitation, and # IdCitation and ReferenceCitation objects. 
return citations def extract_reference_citations( - citation: FullCitation, - plain_text: str, - markup_text: str = "", - plain_to_markup: Optional[SpanUpdater] = None, - markup_to_plain: Optional[SpanUpdater] = None, + citation: ResourceCitation, document: Document ) -> List[ReferenceCitation]: """Extract reference citations that follow a full citation :param citation: the full case citation found - :param plain_text: the text - :param markup_text: optional argument for source text with XML style tags - that may help extracting name-only ReferenceCitations - :param plain_to_markup: a SpanUpdater from plain or clean text to - marked up text - :param markup_to_plain: a SpanUpdater from marked up text to plain text + :param document: document object to parse :return: Reference citations """ - if len(plain_text) <= citation.span()[-1]: + if len(document.plain_text) <= citation.span()[-1]: return [] if not isinstance(citation, FullCaseCitation): return [] reference_citations = extract_pincited_reference_citations( - citation, plain_text + citation, document.plain_text ) - if markup_text: + if document.markup_text: reference_citations.extend( find_reference_citations_from_markup( - markup_text, - plain_text, + document, [citation], - plain_to_markup, - markup_to_plain, ) ) @@ -392,11 +374,8 @@ def _extract_id_citation( def find_reference_citations_from_markup( - markup_text: str, - plain_text: str, + document: Document, citations: list, - plain_to_markup: Optional[SpanUpdater] = None, - markup_to_plain: Optional[SpanUpdater] = None, ) -> list[ReferenceCitation]: """Use HTML/XML style tags and parties names to find ReferenceCitations @@ -420,11 +399,6 @@ def find_reference_citations_from_markup( :return: a list of ReferenceCitations """ - if not markup_to_plain: - markup_to_plain = SpanUpdater(markup_text, plain_text) - if not plain_to_markup: - plain_to_markup = SpanUpdater(plain_text, markup_text) - references = [] tags = "|".join(["em", "i"]) @@ -453,30 +427,39 @@ def find_reference_citations_from_markup( # `utils.maybe_balance_style tags` for reference; it has some tolerance # which may be enough for these citations regex = rf"<(?:{tags})>\s*({'|'.join(regexes)})[:;.,\s]*" - start_in_markup = plain_to_markup.update( + + if ( + not document.plain_to_markup + or not document.markup_to_plain + or not document.markup_text + ): + # ensure we have markup text + return [] + start_in_markup = document.plain_to_markup.update( citation.span()[0], bisect_right ) - for match in re.finditer(regex, markup_text[start_in_markup:]): - full_start_in_plain = markup_to_plain.update( + for match in re.finditer( + regex, document.markup_text[start_in_markup:] + ): + full_start_in_plain = document.markup_to_plain.update( start_in_markup + match.start(), bisect_left ) - full_end_in_plain = markup_to_plain.update( + full_end_in_plain = document.markup_to_plain.update( start_in_markup + match.end(), bisect_right ) # the first group [match.group(0)] is the whole match, # with whitespace and punctuation. 
the second group, match.group(1) # is the only capturing and named group - start_in_plain = markup_to_plain.update( + start_in_plain = document.markup_to_plain.update( start_in_markup + match.start(1), bisect_left ) - end_in_plain = markup_to_plain.update( + end_in_plain = document.markup_to_plain.update( start_in_markup + match.end(1), bisect_right ) - reference = ReferenceCitation( token=CaseReferenceToken( - data=plain_text[start_in_plain:end_in_plain], + data=document.plain_text[start_in_plain:end_in_plain], start=start_in_plain, end=end_in_plain, ), diff --git a/eyecite/models.py b/eyecite/models.py index 0090d584..5e78eb4e 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -7,6 +7,7 @@ Callable, Dict, Hashable, + Iterable, List, Optional, Sequence, @@ -15,6 +16,8 @@ cast, ) +from eyecite import clean_text +from eyecite.annotate import SpanUpdater from eyecite.utils import REPORTERS_THAT_NEED_PAGE_CORRECTION, hash_sha256 ResourceType = Hashable @@ -861,3 +864,40 @@ def __hash__(self): def __eq__(self, other): return self.__hash__() == other.__hash__() + + +@dataclass(eq=False, unsafe_hash=False) +class Document: + plain_text: str = "" + markup_text: Optional[str] = "" + citation_tokens: list[Tuple[int, Token]] = field(default_factory=list) + words: Tokens = field(default_factory=list) + plain_to_markup: Optional[SpanUpdater] = field(default=None, init=False) + markup_to_plain: Optional[SpanUpdater] = field(default=None, init=False) + clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = field( + default_factory=list + ) + + def __post_init__(self): + if self.plain_text and self.clean_steps: + self.plain_text = clean_text(self.plain_text, self.clean_steps) + + if self.markup_text != "": + if "html" not in self.clean_steps: + raise ( + "`html` is a required cleanup step for markup text", + self.markup_text, + ) + + self.plain_text = clean_text(self.markup_text, self.clean_steps) + + self.plain_to_markup = SpanUpdater( + self.plain_text, self.markup_text + ) + self.markup_to_plain = SpanUpdater( + self.markup_text, self.plain_text + ) + + def tokenize(self, tokenizer): + # Tokenize the document and store the results in the document object + self.words, self.citation_tokens = tokenizer.tokenize(self.plain_text) diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py index 46a49b2d..c4fa127f 100644 --- a/tests/test_AnnotateTest.py +++ b/tests/test_AnnotateTest.py @@ -3,6 +3,7 @@ from unittest import TestCase from eyecite import annotate_citations, clean_text, get_citations +from eyecite.models import Document from eyecite.utils import maybe_balance_style_tags @@ -207,12 +208,18 @@ def lower_annotator(before, text, after): clean_steps=clean_steps, annotate_args=annotate_kwargs, ): - get_citations_args = {} if annotate_kwargs.pop("use_markup", False): get_citations_args = {"markup_text": source_text} + else: + get_citations_args = {"plain_text": source_text} - plain_text = clean_text(source_text, clean_steps) - cites = get_citations(plain_text, **get_citations_args) + document = Document( + **get_citations_args, clean_steps=clean_steps + ) + + cites = get_citations( + **get_citations_args, clean_steps=clean_steps + ) annotations = [ (c.span(), f"<{i}>", f"") for i, c in enumerate(cites) @@ -225,7 +232,7 @@ def lower_annotator(before, text, after): ] annotated = annotate_citations( - plain_text, + document.plain_text, annotations, source_text=source_text, **annotate_kwargs, diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index 51ba4ef7..64406e79 
100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -3,13 +3,14 @@ from datetime import datetime from unittest import TestCase -from eyecite import clean_text, get_citations +from eyecite import get_citations from eyecite.find import extract_reference_citations from eyecite.helpers import filter_citations # by default tests use a cache for speed # call tests with `EYECITE_CACHE_DIR= python ...` to disable cache from eyecite.models import ( + Document, FullCaseCitation, ReferenceCitation, ResourceCitation, @@ -60,15 +61,17 @@ def get_comparison_attrs(cite): tokenizers = tested_tokenizers for q, expected_cites, *kwargs in test_pairs: kwargs = kwargs[0] if kwargs else {} - clean_steps = kwargs.pop("clean", []) - clean_q = clean_text(q, clean_steps) + clean_steps = kwargs.get("clean_steps", []) for tokenizer in tokenizers: with self.subTest( message, tokenizer=type(tokenizer).__name__, q=q ): - cites_found = get_citations( - clean_q, tokenizer=tokenizer, **kwargs - ) + if "html" in clean_steps: + kwargs["markup_text"] = q + else: + kwargs["plain_text"] = q + + cites_found = get_citations(tokenizer=tokenizer, **kwargs) self.assertEqual( [type(i) for i in cites_found], [type(i) for i in expected_cites], @@ -93,11 +96,11 @@ def test_find_citations(self): # Basic test with a line break ('1 U.S.\n1', [case_citation()], - {'clean': ['all_whitespace']}), + {'clean_steps': ['all_whitespace']}), # Basic test with a line break within a reporter ('1 U.\nS. 1', [case_citation(reporter_found='U. S.')], - {'clean': ['all_whitespace']}), + {'clean_steps': ['all_whitespace']}), # Basic test of non-case name before citation (should not be found) ('lissner test 1 U.S. 1', [case_citation()]), @@ -259,7 +262,7 @@ def test_find_citations(self): [supra_citation("supra,", metadata={'pin_cite': 'at 2', 'antecedent_guess': 'asdf'})], - {'clean': ['all_whitespace']}), + {'clean_steps': ['all_whitespace']}), # Test short form citation with a page range ('before asdf, 1 U. S., at 20-25', [case_citation(page='20', reporter_found='U. S.', short=True, @@ -383,7 +386,7 @@ def test_find_citations(self): # Test italicized Ibid. citation ('
<p>before asdf. <i>Ibid.</i></p> <p>foo bar lorem</p>
', [id_citation('Ibid.')], - {'clean': ['html', 'inline_whitespace']}), + {'clean_steps': ['html', 'inline_whitespace']}), # Test Id. citation ('foo v. bar 1 U.S. 12, 347-348. asdf. Id., at 123. foo bar', [case_citation(page='12', @@ -399,15 +402,15 @@ def test_find_citations(self): 'defendant': 'bar', 'pin_cite': '347-348'}), id_citation('Id.,', metadata={'pin_cite': 'at 123'})], - {'clean': ['all_whitespace']}), + {'clean_steps': ['all_whitespace']}), # Test italicized Id. citation ('
<p>before asdf. <i>Id., at 123.</i></p> <p>foo bar</p>
', [id_citation('Id.,', metadata={'pin_cite': 'at 123'})], - {'clean': ['html', 'inline_whitespace']}), + {'clean_steps': ['html', 'inline_whitespace']}), # Test italicized Id. citation with another HTML tag in the way ('
<p>before asdf. <i>Id.,</i> at <b>123.</b></p> <p>foo bar</p>
', [id_citation('Id.,', metadata={'pin_cite': 'at 123'})], - {'clean': ['html', 'inline_whitespace']}), + {'clean_steps': ['html', 'inline_whitespace']}), # Test weirder Id. citations (#1344) ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. ΒΆ 34. foo bar', [case_citation(page='12', @@ -517,7 +520,7 @@ def test_find_citations(self): metadata={'plaintiff': None, 'defendant': None, 'court': 'scotus'})], - {'clean': ['html', 'inline_whitespace']}), + {'clean_steps': ['html', 'inline_whitespace']}), # Test filtering overlapping citations - this finds four citations # but should filter down to three ("Miles v. Smith 1 Ga. 1; asdfasdf asd Something v. Else, 1 Miles 3; 1 Miles at 10", @@ -1012,7 +1015,10 @@ def test_reference_extraction_using_resolved_names(self): citations = get_citations(plain_text) found_cite = citations[0] found_cite.metadata.resolved_case_name = "State v. Wingler" - references = extract_reference_citations(found_cite, plain_text) + document = Document(plain_text=plain_text, markup_text="") + references = extract_reference_citations( + citation=found_cite, document=document + ) final_citations = filter_citations(citations + references) self.assertEqual( len(final_citations), 2, "There should only be 2 citations" @@ -1043,8 +1049,9 @@ def test_reference_extraction_from_markup(self): ex post facto scrutiny simply because it is consistent with punitive goals as well.\" 44 F.3d at 493.

""" - plain_text = clean_text(markup_text, ["html", "all_whitespace"]) - citations = get_citations(plain_text, markup_text=markup_text) + citations = get_citations( + markup_text=markup_text, clean_steps=["html", "all_whitespace"] + ) references = [c for c in citations if isinstance(c, ReferenceCitation)] # Tests both for the order and exact counts. Note that there is one # "Bae" in the text that should not be picked up: "Bae's argument"... @@ -1083,8 +1090,9 @@ def test_reference_filtering(self): """, ] for markup_text in texts: - plain_text = clean_text(markup_text, ["html", "all_whitespace"]) - citations = get_citations(plain_text, markup_text=markup_text) + citations = get_citations( + markup_text=markup_text, clean_steps=["html", "all_whitespace"] + ) self.assertFalse( any( [isinstance(cite, ReferenceCitation) for cite in citations] diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py index 6c4539c4..d40fcb1b 100644 --- a/tests/test_ResolveTest.py +++ b/tests/test_ResolveTest.py @@ -5,7 +5,7 @@ from eyecite import get_citations from eyecite.find import extract_reference_citations from eyecite.helpers import filter_citations -from eyecite.models import FullCitation, Resource +from eyecite.models import Document, FullCitation, Resource from eyecite.resolve import resolve_citations @@ -52,6 +52,10 @@ def checkReferenceResolution( Returns: None """ + + document = Document( + plain_text=citation_text, + ) citations = get_citations(citation_text) if resolved_case_name_short: citations[0].metadata.resolved_case_name_short = ( @@ -59,7 +63,7 @@ def checkReferenceResolution( ) citations.extend( extract_reference_citations( - citations[0], citation_text # type: ignore[arg-type] + citations[0], document # type: ignore[arg-type] ) ) citations = filter_citations(citations) From de4cfd9b944af6bfa0a9891f83d456161d5fa575 Mon Sep 17 00:00:00 2001 From: William Palin Date: Wed, 19 Mar 2025 13:44:45 -0400 Subject: [PATCH 2/6] chore(changes): Update changes.md --- CHANGES.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index a03c6f06..a30173a0 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,10 +5,13 @@ The following changes are not yet released, but are code complete: Features: -- None +- Introduced `Document` object to encapsulate plain text, markup text, span updates, tokens, and citation strings. +- Simplifies citation processing by reducing parameter passing and improving maintainability (hopefully). +- Should enable more complex html parsing. Changes: -- None +- Moved text cleaning logic into `get_citations` for simpler call with markup +- Fixes: - Prefer the other full citation on overlap with nominative reporter From 5a3e813a06720861b978cbb2c07569e4af24f403 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 20 Mar 2025 16:12:37 -0400 Subject: [PATCH 3/6] chore(find.py): Fix docstring --- eyecite/find.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/eyecite/find.py b/eyecite/find.py index 04577753..75b5ef6e 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -389,13 +389,9 @@ def find_reference_citations_from_markup( Creating the SpanUpdaters for each full citation will be too slow, re-use them if possible - :param markup_text: HTML or XML source - :param plain_text: cleaned text + :param document: Document object we are parsing :param citations: list of citations found over plain text. 
The full cites will be used to access parties names metadata - :param plain_to_markup: a SpanUpdater from plain or clean text to - marked up text - :param markup_to_plain: a SpanUpdater from marked up text to plain text :return: a list of ReferenceCitations """ From 8a225d37f4ffa5938996ffd359ceda5cb33af48f Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 21 Mar 2025 09:30:46 -0400 Subject: [PATCH 4/6] fix(benchmark): Update benchmark for new command Refactor call to use cleaning inside method --- benchmark/benchmark.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index f7e48f30..7414a2be 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -52,19 +52,15 @@ def generate_branch_report(self, branch: str) -> None: or row["html_anon_2020"] or row["html"] ) + params = {"clean_steps": ["html", "inline_whitespace"]} if text: # Remove XML encodings from xml_harvard text = re.sub(r"^<\?xml.*?\?>", "", text, count=1) - opinion_text_is_marked_up = True + params['markup_text'] = text or "" else: - text = row["plain_text"] - opinion_text_is_marked_up = False + params['markup_text'] = row['plain_text'] - plain_text = clean_text(text, ["html", "inline_whitespace"]) - found_citations = get_citations( - plain_text, - markup_text=text if opinion_text_is_marked_up else "", - ) + found_citations = get_citations(**params) # Get the citation text string from the cite object cites = [cite.token.data for cite in found_citations if cite.token] From e3415108c229890d7621d308c4f79d078ab2a661 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 21 Mar 2025 09:42:15 -0400 Subject: [PATCH 5/6] fix(benchmark): Lint --- benchmark/benchmark.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 7414a2be..a951b7c2 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -8,10 +8,11 @@ import sys from io import StringIO from pathlib import Path +from typing import Any, Dict from matplotlib import pyplot as plt # type: ignore -from eyecite import clean_text, get_citations +from eyecite import get_citations SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.dirname(SCRIPT_DIR)) @@ -45,20 +46,22 @@ def generate_branch_report(self, branch: str) -> None: now = datetime.datetime.now() data = [] for row in csv_data: - text = ( + text: str = ( row["xml_harvard"] or row["html_lawbox"] or row["html_columbia"] or row["html_anon_2020"] or row["html"] ) - params = {"clean_steps": ["html", "inline_whitespace"]} + params: Dict[str, Any] = { + "clean_steps": ["html", "inline_whitespace"] + } if text: # Remove XML encodings from xml_harvard text = re.sub(r"^<\?xml.*?\?>", "", text, count=1) - params['markup_text'] = text or "" + params["markup_text"] = text or "" else: - params['markup_text'] = row['plain_text'] + params["markup_text"] = row["plain_text"] found_citations = get_citations(**params) From ba78e8e5857ef6a43e01984185359414b2da5b60 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 21 Mar 2025 09:48:40 -0400 Subject: [PATCH 6/6] fix(find): Check only for references after full citation --- eyecite/find.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/eyecite/find.py b/eyecite/find.py index 43ab31de..cef72d1c 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -95,10 +95,10 @@ def get_citations( pre = cast(FullCaseCitation, citations[-1]) # type: ignore 
citation.is_parallel_citation(pre) - # Check for reference citations that follow a full citation - # Using the plaintiff or defendant - references = extract_reference_citations(citation, document) - citations.extend(references) + # Check for reference citations that follow a full citation + # Using the plaintiff or defendant + references = extract_reference_citations(citation, document) + citations.extend(references) # CASE 2: Token is an "Id." or "Ibid." reference. # In this case, the citation should simply be to the item cited