From 8e66ed1d06fb8fcad06b6b03247a17bc867820ba Mon Sep 17 00:00:00 2001 From: William Palin Date: Wed, 19 Mar 2025 13:30:06 -0400 Subject: [PATCH 1/6] feat(eyecite): Init Document object for better citation parsing With the introduction of markup parsing, handling multiple parameters became unwieldy. To address this, I added a `Document` object that encapsulates: - Plain and markup text - Span updates for text mapping - Tokenized words and extracted citation tokens This refactor should enable: - **More complex parsing**, using HTML - **Simplified logic** by reducing the number of parameters passed around Additionally, it simplifies the API call by letting a user pass in either markup or plain text, without needing to provide both. --- eyecite/find.py | 111 ++++++++++++++++--------------------- eyecite/models.py | 40 +++++++++++++ tests/test_AnnotateTest.py | 15 +++-- tests/test_FindTest.py | 46 ++++++++------- tests/test_ResolveTest.py | 8 ++- 5 files changed, 131 insertions(+), 89 deletions(-) diff --git a/eyecite/find.py b/eyecite/find.py index de322d60..04577753 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -1,8 +1,7 @@ import re from bisect import bisect_left, bisect_right -from typing import List, Optional, Type, cast +from typing import Callable, Iterable, List, Optional, Type, Union, cast -from eyecite.annotate import SpanUpdater from eyecite.helpers import ( disambiguate_reporters, extract_pin_cite, @@ -14,6 +13,7 @@ CaseReferenceToken, CitationBase, CitationToken, + Document, FullCaseCitation, FullCitation, FullJournalCitation, @@ -35,15 +35,16 @@ def get_citations( - plain_text: str, + plain_text: str = "", remove_ambiguous: bool = False, tokenizer: Tokenizer = default_tokenizer, markup_text: str = "", + clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = None, ) -> List[CitationBase]: """This is eyecite's main workhorse function. Given a string of text - (e.g., a judicial opinion or other legal document), return a list of + (e.g., a judicial opinion or other legal doc), return a list of `eyecite.models.CitationBase` objects representing the citations found - in the document. + in the doc. Args: plain_text: The text to parse. 
You may wish to use the @@ -57,6 +58,7 @@ def get_citations( markup_text: if the source text has markup (XML or HTML mostly), pass it to extract ReferenceCitations that may be detectable via markup style tags + clean_steps: Cleanup steps and methods Returns: A list of `eyecite.models.CitationBase` objects @@ -64,16 +66,14 @@ def get_citations( if plain_text == "eyecite": return joke_cite - words, citation_tokens = tokenizer.tokenize(plain_text) + document = Document( + plain_text=plain_text, + markup_text=markup_text, + clean_steps=clean_steps, + ) + document.tokenize(tokenizer=tokenizer) citations: list[CitationBase] = [] - - if markup_text: - plain_to_markup = SpanUpdater(plain_text, markup_text) - markup_to_plain = SpanUpdater(markup_text, plain_text) - else: - plain_to_markup, markup_to_plain = None, None - - for i, token in citation_tokens: + for i, token in document.citation_tokens: citation: CitationBase token_type = type(token) @@ -84,36 +84,30 @@ def get_citations( if token_type is CitationToken: citation_token = cast(CitationToken, token) if citation_token.short: - citation = _extract_shortform_citation(words, i) + citation = _extract_shortform_citation(document.words, i) else: - citation = _extract_full_citation(words, i) + citation = _extract_full_citation(document.words, i) if citations and isinstance(citation, FullCitation): citation.is_parallel_citation(citations[-1]) - # Check for reference citations that follow a full citation - # Using the plaintiff or defendant - references = extract_reference_citations( - citation, - plain_text, - markup_text, - plain_to_markup, - markup_to_plain, - ) - citations.extend(references) + # Check for reference citations that follow a full citation + # Using the plaintiff or defendant + references = extract_reference_citations(citation, document) + citations.extend(references) # CASE 2: Token is an "Id." or "Ibid." reference. # In this case, the citation should simply be to the item cited # immediately prior, but for safety we will leave that resolution up # to the user. elif token_type is IdToken: - citation = _extract_id_citation(words, i) + citation = _extract_id_citation(document.words, i) # CASE 3: Token is a "supra" reference. # In this case, we're not sure yet what the citation's antecedent is. # It could be any of the previous citations above. Thus, like an Id. # citation, for safety we won't resolve this reference yet. elif token_type is SupraToken: - citation = _extract_supra_citation(words, i) + citation = _extract_supra_citation(document.words, i) # CASE 4: Token is a section marker. # In this case, it's likely that this is a reference to a citation, @@ -137,48 +131,36 @@ def get_citations( citations = disambiguate_reporters(citations) # Returns a list of citations ordered in the sequence that they appear in - # the document. The ordering of this list is important for reconstructing + # the doc. The ordering of this list is important for reconstructing # the references of the ShortCaseCitation, SupraCitation, and # IdCitation and ReferenceCitation objects. 
return citations def extract_reference_citations( - citation: FullCitation, - plain_text: str, - markup_text: str = "", - plain_to_markup: Optional[SpanUpdater] = None, - markup_to_plain: Optional[SpanUpdater] = None, + citation: ResourceCitation, document: Document ) -> List[ReferenceCitation]: """Extract reference citations that follow a full citation :param citation: the full case citation found - :param plain_text: the text - :param markup_text: optional argument for source text with XML style tags - that may help extracting name-only ReferenceCitations - :param plain_to_markup: a SpanUpdater from plain or clean text to - marked up text - :param markup_to_plain: a SpanUpdater from marked up text to plain text + :param document: document object to parse :return: Reference citations """ - if len(plain_text) <= citation.span()[-1]: + if len(document.plain_text) <= citation.span()[-1]: return [] if not isinstance(citation, FullCaseCitation): return [] reference_citations = extract_pincited_reference_citations( - citation, plain_text + citation, document.plain_text ) - if markup_text: + if document.markup_text: reference_citations.extend( find_reference_citations_from_markup( - markup_text, - plain_text, + document, [citation], - plain_to_markup, - markup_to_plain, ) ) @@ -392,11 +374,8 @@ def _extract_id_citation( def find_reference_citations_from_markup( - markup_text: str, - plain_text: str, + document: Document, citations: list, - plain_to_markup: Optional[SpanUpdater] = None, - markup_to_plain: Optional[SpanUpdater] = None, ) -> list[ReferenceCitation]: """Use HTML/XML style tags and parties names to find ReferenceCitations @@ -420,11 +399,6 @@ def find_reference_citations_from_markup( :return: a list of ReferenceCitations """ - if not markup_to_plain: - markup_to_plain = SpanUpdater(markup_text, plain_text) - if not plain_to_markup: - plain_to_markup = SpanUpdater(plain_text, markup_text) - references = [] tags = "|".join(["em", "i"]) @@ -453,30 +427,39 @@ def find_reference_citations_from_markup( # `utils.maybe_balance_style tags` for reference; it has some tolerance # which may be enough for these citations regex = rf"<(?:{tags})>\s*({'|'.join(regexes)})[:;.,\s]*" - start_in_markup = plain_to_markup.update( + + if ( + not document.plain_to_markup + or not document.markup_to_plain + or not document.markup_text + ): + # ensure we have markup text + return [] + start_in_markup = document.plain_to_markup.update( citation.span()[0], bisect_right ) - for match in re.finditer(regex, markup_text[start_in_markup:]): - full_start_in_plain = markup_to_plain.update( + for match in re.finditer( + regex, document.markup_text[start_in_markup:] + ): + full_start_in_plain = document.markup_to_plain.update( start_in_markup + match.start(), bisect_left ) - full_end_in_plain = markup_to_plain.update( + full_end_in_plain = document.markup_to_plain.update( start_in_markup + match.end(), bisect_right ) # the first group [match.group(0)] is the whole match, # with whitespace and punctuation. 
the second group, match.group(1) # is the only capturing and named group - start_in_plain = markup_to_plain.update( + start_in_plain = document.markup_to_plain.update( start_in_markup + match.start(1), bisect_left ) - end_in_plain = markup_to_plain.update( + end_in_plain = document.markup_to_plain.update( start_in_markup + match.end(1), bisect_right ) - reference = ReferenceCitation( token=CaseReferenceToken( - data=plain_text[start_in_plain:end_in_plain], + data=document.plain_text[start_in_plain:end_in_plain], start=start_in_plain, end=end_in_plain, ), diff --git a/eyecite/models.py b/eyecite/models.py index 0090d584..5e78eb4e 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -7,6 +7,7 @@ Callable, Dict, Hashable, + Iterable, List, Optional, Sequence, @@ -15,6 +16,8 @@ cast, ) +from eyecite import clean_text +from eyecite.annotate import SpanUpdater from eyecite.utils import REPORTERS_THAT_NEED_PAGE_CORRECTION, hash_sha256 ResourceType = Hashable @@ -861,3 +864,40 @@ def __hash__(self): def __eq__(self, other): return self.__hash__() == other.__hash__() + + +@dataclass(eq=False, unsafe_hash=False) +class Document: + plain_text: str = "" + markup_text: Optional[str] = "" + citation_tokens: list[Tuple[int, Token]] = field(default_factory=list) + words: Tokens = field(default_factory=list) + plain_to_markup: Optional[SpanUpdater] = field(default=None, init=False) + markup_to_plain: Optional[SpanUpdater] = field(default=None, init=False) + clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = field( + default_factory=list + ) + + def __post_init__(self): + if self.plain_text and self.clean_steps: + self.plain_text = clean_text(self.plain_text, self.clean_steps) + + if self.markup_text != "": + if "html" not in self.clean_steps: + raise ( + "`html` is a required cleanup step for markup text", + self.markup_text, + ) + + self.plain_text = clean_text(self.markup_text, self.clean_steps) + + self.plain_to_markup = SpanUpdater( + self.plain_text, self.markup_text + ) + self.markup_to_plain = SpanUpdater( + self.markup_text, self.plain_text + ) + + def tokenize(self, tokenizer): + # Tokenize the document and store the results in the document object + self.words, self.citation_tokens = tokenizer.tokenize(self.plain_text) diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py index 46a49b2d..c4fa127f 100644 --- a/tests/test_AnnotateTest.py +++ b/tests/test_AnnotateTest.py @@ -3,6 +3,7 @@ from unittest import TestCase from eyecite import annotate_citations, clean_text, get_citations +from eyecite.models import Document from eyecite.utils import maybe_balance_style_tags @@ -207,12 +208,18 @@ def lower_annotator(before, text, after): clean_steps=clean_steps, annotate_args=annotate_kwargs, ): - get_citations_args = {} if annotate_kwargs.pop("use_markup", False): get_citations_args = {"markup_text": source_text} + else: + get_citations_args = {"plain_text": source_text} - plain_text = clean_text(source_text, clean_steps) - cites = get_citations(plain_text, **get_citations_args) + document = Document( + **get_citations_args, clean_steps=clean_steps + ) + + cites = get_citations( + **get_citations_args, clean_steps=clean_steps + ) annotations = [ (c.span(), f"<{i}>", f"") for i, c in enumerate(cites) @@ -225,7 +232,7 @@ def lower_annotator(before, text, after): ] annotated = annotate_citations( - plain_text, + document.plain_text, annotations, source_text=source_text, **annotate_kwargs, diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index 51ba4ef7..64406e79 
100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -3,13 +3,14 @@ from datetime import datetime from unittest import TestCase -from eyecite import clean_text, get_citations +from eyecite import get_citations from eyecite.find import extract_reference_citations from eyecite.helpers import filter_citations # by default tests use a cache for speed # call tests with `EYECITE_CACHE_DIR= python ...` to disable cache from eyecite.models import ( + Document, FullCaseCitation, ReferenceCitation, ResourceCitation, @@ -60,15 +61,17 @@ def get_comparison_attrs(cite): tokenizers = tested_tokenizers for q, expected_cites, *kwargs in test_pairs: kwargs = kwargs[0] if kwargs else {} - clean_steps = kwargs.pop("clean", []) - clean_q = clean_text(q, clean_steps) + clean_steps = kwargs.get("clean_steps", []) for tokenizer in tokenizers: with self.subTest( message, tokenizer=type(tokenizer).__name__, q=q ): - cites_found = get_citations( - clean_q, tokenizer=tokenizer, **kwargs - ) + if "html" in clean_steps: + kwargs["markup_text"] = q + else: + kwargs["plain_text"] = q + + cites_found = get_citations(tokenizer=tokenizer, **kwargs) self.assertEqual( [type(i) for i in cites_found], [type(i) for i in expected_cites], @@ -93,11 +96,11 @@ def test_find_citations(self): # Basic test with a line break ('1 U.S.\n1', [case_citation()], - {'clean': ['all_whitespace']}), + {'clean_steps': ['all_whitespace']}), # Basic test with a line break within a reporter ('1 U.\nS. 1', [case_citation(reporter_found='U. S.')], - {'clean': ['all_whitespace']}), + {'clean_steps': ['all_whitespace']}), # Basic test of non-case name before citation (should not be found) ('lissner test 1 U.S. 1', [case_citation()]), @@ -259,7 +262,7 @@ def test_find_citations(self): [supra_citation("supra,", metadata={'pin_cite': 'at 2', 'antecedent_guess': 'asdf'})], - {'clean': ['all_whitespace']}), + {'clean_steps': ['all_whitespace']}), # Test short form citation with a page range ('before asdf, 1 U. S., at 20-25', [case_citation(page='20', reporter_found='U. S.', short=True, @@ -383,7 +386,7 @@ def test_find_citations(self): # Test italicized Ibid. citation ('
<p>before asdf. <i>Ibid.</i></p> <p>foo bar lorem</p>
', [id_citation('Ibid.')], - {'clean': ['html', 'inline_whitespace']}), + {'clean_steps': ['html', 'inline_whitespace']}), # Test Id. citation ('foo v. bar 1 U.S. 12, 347-348. asdf. Id., at 123. foo bar', [case_citation(page='12', @@ -399,15 +402,15 @@ def test_find_citations(self): 'defendant': 'bar', 'pin_cite': '347-348'}), id_citation('Id.,', metadata={'pin_cite': 'at 123'})], - {'clean': ['all_whitespace']}), + {'clean_steps': ['all_whitespace']}), # Test italicized Id. citation ('
<p>before asdf. <i>Id., at 123.</i></p> <p>foo bar</p>
', [id_citation('Id.,', metadata={'pin_cite': 'at 123'})], - {'clean': ['html', 'inline_whitespace']}), + {'clean_steps': ['html', 'inline_whitespace']}), # Test italicized Id. citation with another HTML tag in the way ('
<p>before asdf. <i>Id.,</i> at <b>123.</b></p> <p>foo bar</p>
', [id_citation('Id.,', metadata={'pin_cite': 'at 123'})], - {'clean': ['html', 'inline_whitespace']}), + {'clean_steps': ['html', 'inline_whitespace']}), # Test weirder Id. citations (#1344) ('foo v. bar 1 U.S. 12, 347-348. asdf. Id. ΒΆ 34. foo bar', [case_citation(page='12', @@ -517,7 +520,7 @@ def test_find_citations(self): metadata={'plaintiff': None, 'defendant': None, 'court': 'scotus'})], - {'clean': ['html', 'inline_whitespace']}), + {'clean_steps': ['html', 'inline_whitespace']}), # Test filtering overlapping citations - this finds four citations # but should filter down to three ("Miles v. Smith 1 Ga. 1; asdfasdf asd Something v. Else, 1 Miles 3; 1 Miles at 10", @@ -1012,7 +1015,10 @@ def test_reference_extraction_using_resolved_names(self): citations = get_citations(plain_text) found_cite = citations[0] found_cite.metadata.resolved_case_name = "State v. Wingler" - references = extract_reference_citations(found_cite, plain_text) + document = Document(plain_text=plain_text, markup_text="") + references = extract_reference_citations( + citation=found_cite, document=document + ) final_citations = filter_citations(citations + references) self.assertEqual( len(final_citations), 2, "There should only be 2 citations" @@ -1043,8 +1049,9 @@ def test_reference_extraction_from_markup(self): ex post facto scrutiny simply because it is consistent with punitive goals as well.\" 44 F.3d at 493.

""" - plain_text = clean_text(markup_text, ["html", "all_whitespace"]) - citations = get_citations(plain_text, markup_text=markup_text) + citations = get_citations( + markup_text=markup_text, clean_steps=["html", "all_whitespace"] + ) references = [c for c in citations if isinstance(c, ReferenceCitation)] # Tests both for the order and exact counts. Note that there is one # "Bae" in the text that should not be picked up: "Bae's argument"... @@ -1083,8 +1090,9 @@ def test_reference_filtering(self): """, ] for markup_text in texts: - plain_text = clean_text(markup_text, ["html", "all_whitespace"]) - citations = get_citations(plain_text, markup_text=markup_text) + citations = get_citations( + markup_text=markup_text, clean_steps=["html", "all_whitespace"] + ) self.assertFalse( any( [isinstance(cite, ReferenceCitation) for cite in citations] diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py index 6c4539c4..d40fcb1b 100644 --- a/tests/test_ResolveTest.py +++ b/tests/test_ResolveTest.py @@ -5,7 +5,7 @@ from eyecite import get_citations from eyecite.find import extract_reference_citations from eyecite.helpers import filter_citations -from eyecite.models import FullCitation, Resource +from eyecite.models import Document, FullCitation, Resource from eyecite.resolve import resolve_citations @@ -52,6 +52,10 @@ def checkReferenceResolution( Returns: None """ + + document = Document( + plain_text=citation_text, + ) citations = get_citations(citation_text) if resolved_case_name_short: citations[0].metadata.resolved_case_name_short = ( @@ -59,7 +63,7 @@ def checkReferenceResolution( ) citations.extend( extract_reference_citations( - citations[0], citation_text # type: ignore[arg-type] + citations[0], document # type: ignore[arg-type] ) ) citations = filter_citations(citations) From de4cfd9b944af6bfa0a9891f83d456161d5fa575 Mon Sep 17 00:00:00 2001 From: William Palin Date: Wed, 19 Mar 2025 13:44:45 -0400 Subject: [PATCH 2/6] chore(changes): Update changes.md --- CHANGES.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index a03c6f06..a30173a0 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,10 +5,13 @@ The following changes are not yet released, but are code complete: Features: -- None +- Introduced `Document` object to encapsulate plain text, markup text, span updates, tokens, and citation strings. +- Simplifies citation processing by reducing parameter passing and improving maintainability (hopefully). +- Should enable more complex html parsing. Changes: -- None +- Moved text cleaning logic into `get_citations` for simpler call with markup +- Fixes: - Prefer the other full citation on overlap with nominative reporter From 5a3e813a06720861b978cbb2c07569e4af24f403 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 20 Mar 2025 16:12:37 -0400 Subject: [PATCH 3/6] chore(find.py): Fix docstring --- eyecite/find.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/eyecite/find.py b/eyecite/find.py index 04577753..75b5ef6e 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -389,13 +389,9 @@ def find_reference_citations_from_markup( Creating the SpanUpdaters for each full citation will be too slow, re-use them if possible - :param markup_text: HTML or XML source - :param plain_text: cleaned text + :param document: Document object we are parsing :param citations: list of citations found over plain text. 
The full cites will be used to access parties names metadata - :param plain_to_markup: a SpanUpdater from plain or clean text to - marked up text - :param markup_to_plain: a SpanUpdater from marked up text to plain text :return: a list of ReferenceCitations """ From 8a225d37f4ffa5938996ffd359ceda5cb33af48f Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 21 Mar 2025 09:30:46 -0400 Subject: [PATCH 4/6] fix(benchmark): Update benchmark for new command Refactor call to use cleaning inside method --- benchmark/benchmark.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index f7e48f30..7414a2be 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -52,19 +52,15 @@ def generate_branch_report(self, branch: str) -> None: or row["html_anon_2020"] or row["html"] ) + params = {"clean_steps": ["html", "inline_whitespace"]} if text: # Remove XML encodings from xml_harvard text = re.sub(r"^<\?xml.*?\?>", "", text, count=1) - opinion_text_is_marked_up = True + params['markup_text'] = text or "" else: - text = row["plain_text"] - opinion_text_is_marked_up = False + params['markup_text'] = row['plain_text'] - plain_text = clean_text(text, ["html", "inline_whitespace"]) - found_citations = get_citations( - plain_text, - markup_text=text if opinion_text_is_marked_up else "", - ) + found_citations = get_citations(**params) # Get the citation text string from the cite object cites = [cite.token.data for cite in found_citations if cite.token] From e3415108c229890d7621d308c4f79d078ab2a661 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 21 Mar 2025 09:42:15 -0400 Subject: [PATCH 5/6] fix(benchmark): Lint --- benchmark/benchmark.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 7414a2be..a951b7c2 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -8,10 +8,11 @@ import sys from io import StringIO from pathlib import Path +from typing import Any, Dict from matplotlib import pyplot as plt # type: ignore -from eyecite import clean_text, get_citations +from eyecite import get_citations SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.dirname(SCRIPT_DIR)) @@ -45,20 +46,22 @@ def generate_branch_report(self, branch: str) -> None: now = datetime.datetime.now() data = [] for row in csv_data: - text = ( + text: str = ( row["xml_harvard"] or row["html_lawbox"] or row["html_columbia"] or row["html_anon_2020"] or row["html"] ) - params = {"clean_steps": ["html", "inline_whitespace"]} + params: Dict[str, Any] = { + "clean_steps": ["html", "inline_whitespace"] + } if text: # Remove XML encodings from xml_harvard text = re.sub(r"^<\?xml.*?\?>", "", text, count=1) - params['markup_text'] = text or "" + params["markup_text"] = text or "" else: - params['markup_text'] = row['plain_text'] + params["markup_text"] = row["plain_text"] found_citations = get_citations(**params) From ba78e8e5857ef6a43e01984185359414b2da5b60 Mon Sep 17 00:00:00 2001 From: William Palin Date: Fri, 21 Mar 2025 09:48:40 -0400 Subject: [PATCH 6/6] fix(find): Check only for references after full citation --- eyecite/find.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/eyecite/find.py b/eyecite/find.py index 43ab31de..cef72d1c 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -95,10 +95,10 @@ def get_citations( pre = cast(FullCaseCitation, citations[-1]) # type: ignore 
citation.is_parallel_citation(pre) - # Check for reference citations that follow a full citation - # Using the plaintiff or defendant - references = extract_reference_citations(citation, document) - citations.extend(references) + # Check for reference citations that follow a full citation + # Using the plaintiff or defendant + references = extract_reference_citations(citation, document) + citations.extend(references) # CASE 2: Token is an "Id." or "Ibid." reference. # In this case, the citation should simply be to the item cited