
Commit 7f49660

Merge pull request #240 from freelawproject/simplify-method
Simplify get citation call for markup
2 parents: ba84b13 + ba78e8e

7 files changed: +142 -101 lines changed

CHANGES.md

+4
@@ -5,9 +5,13 @@
 The following changes are not yet released, but are code complete:
 
 Features:
+- Introduced `Document` object to encapsulate plain text, markup text, span updates, tokens, and citation strings.
+- Simplifies citation processing by reducing parameter passing and improving maintainability (hopefully).
+- Should enable more complex html parsing.
 - Adds support for years preceding citations
 
 Changes:
+- Moved text cleaning logic into `get_citations` for simpler call with markup
 - Simplifies is parallel logic
 - moves is parallel citation to full case citation
 
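In practice, the entries above boil down to a simpler call: instead of running `clean_text` yourself and passing both strings, you hand `get_citations` the raw markup plus the cleanup steps and it builds the `Document` internally. A minimal before/after sketch (the HTML snippet and variable names are illustrative, not taken from this commit):

from eyecite import clean_text, get_citations

html = "<p>See <em>Bush v. Gore</em>, 531 U.S. 98 (2000).</p>"  # made-up input

# Old style: clean first, then pass both the cleaned text and the markup.
plain = clean_text(html, ["html", "inline_whitespace"])
old_cites = get_citations(plain, markup_text=html)

# New style after this commit: one call, cleaning happens inside.
new_cites = get_citations(
    markup_text=html, clean_steps=["html", "inline_whitespace"]
)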

benchmark/benchmark.py

+9 -10

@@ -8,10 +8,11 @@
 import sys
 from io import StringIO
 from pathlib import Path
+from typing import Any, Dict
 
 from matplotlib import pyplot as plt  # type: ignore
 
-from eyecite import clean_text, get_citations
+from eyecite import get_citations
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.dirname(SCRIPT_DIR))
@@ -45,26 +46,24 @@ def generate_branch_report(self, branch: str) -> None:
         now = datetime.datetime.now()
         data = []
         for row in csv_data:
-            text = (
+            text: str = (
                 row["xml_harvard"]
                 or row["html_lawbox"]
                 or row["html_columbia"]
                 or row["html_anon_2020"]
                 or row["html"]
             )
+            params: Dict[str, Any] = {
+                "clean_steps": ["html", "inline_whitespace"]
+            }
             if text:
                 # Remove XML encodings from xml_harvard
                 text = re.sub(r"^<\?xml.*?\?>", "", text, count=1)
-                opinion_text_is_marked_up = True
+                params["markup_text"] = text or ""
             else:
-                text = row["plain_text"]
-                opinion_text_is_marked_up = False
+                params["markup_text"] = row["plain_text"]
 
-            plain_text = clean_text(text, ["html", "inline_whitespace"])
-            found_citations = get_citations(
-                plain_text,
-                markup_text=text if opinion_text_is_marked_up else "",
-            )
+            found_citations = get_citations(**params)
 
             # Get the citation text string from the cite object
             cites = [cite.token.data for cite in found_citations if cite.token]
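The benchmark then compares branches by the matched citation strings, pulled from each citation's underlying token as in the last line above. A small sketch of that pattern on plain text (the sample sentence is invented):

from eyecite import get_citations

sample = "See Bush v. Gore, 531 U.S. 98, 100 (2000). Id. at 99."  # invented

for cite in get_citations(sample):
    if cite.token:
        # token.data is the exact matched substring; span() gives its
        # character offsets in the text that was parsed.
        print(cite.token.data, cite.span())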

eyecite/find.py

+45 -66

@@ -1,8 +1,7 @@
 import re
 from bisect import bisect_left, bisect_right
-from typing import List, Optional, Type, cast
+from typing import Callable, Iterable, List, Optional, Type, Union, cast
 
-from eyecite.annotate import SpanUpdater
 from eyecite.helpers import (
     disambiguate_reporters,
     extract_pin_cite,
@@ -14,6 +13,7 @@
     CaseReferenceToken,
     CitationBase,
     CitationToken,
+    Document,
     FullCaseCitation,
     FullCitation,
     FullJournalCitation,
@@ -35,15 +35,16 @@
 
 
 def get_citations(
-    plain_text: str,
+    plain_text: str = "",
     remove_ambiguous: bool = False,
     tokenizer: Tokenizer = default_tokenizer,
     markup_text: str = "",
+    clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = None,
 ) -> List[CitationBase]:
     """This is eyecite's main workhorse function. Given a string of text
-    (e.g., a judicial opinion or other legal document), return a list of
+    (e.g., a judicial opinion or other legal doc), return a list of
     `eyecite.models.CitationBase` objects representing the citations found
-    in the document.
+    in the doc.
 
     Args:
         plain_text: The text to parse. You may wish to use the
@@ -57,23 +58,22 @@ def get_citations(
         markup_text: if the source text has markup (XML or HTML mostly), pass
             it to extract ReferenceCitations that may be detectable via
             markup style tags
+        clean_steps: Cleanup steps and methods
 
 
     Returns:
         A list of `eyecite.models.CitationBase` objects
     """
     if plain_text == "eyecite":
         return joke_cite
-    words, citation_tokens = tokenizer.tokenize(plain_text)
+    document = Document(
+        plain_text=plain_text,
+        markup_text=markup_text,
+        clean_steps=clean_steps,
+    )
+    document.tokenize(tokenizer=tokenizer)
     citations: list[CitationBase] = []
-
-    if markup_text:
-        plain_to_markup = SpanUpdater(plain_text, markup_text)
-        markup_to_plain = SpanUpdater(markup_text, plain_text)
-    else:
-        plain_to_markup, markup_to_plain = None, None
-
-    for i, token in citation_tokens:
+    for i, token in document.citation_tokens:
         citation: CitationBase
         token_type = type(token)
 
@@ -84,9 +84,9 @@ def get_citations(
         if token_type is CitationToken:
             citation_token = cast(CitationToken, token)
             if citation_token.short:
-                citation = _extract_shortform_citation(words, i)
+                citation = _extract_shortform_citation(document.words, i)
             else:
-                citation = _extract_full_citation(words, i)
+                citation = _extract_full_citation(document.words, i)
             if (
                 citations
                 and isinstance(citation, FullCaseCitation)
@@ -97,28 +97,22 @@ def get_citations(
 
                 # Check for reference citations that follow a full citation
                 # Using the plaintiff or defendant
-                references = extract_reference_citations(
-                    citation,
-                    plain_text,
-                    markup_text,
-                    plain_to_markup,
-                    markup_to_plain,
-                )
+                references = extract_reference_citations(citation, document)
                 citations.extend(references)
 
         # CASE 2: Token is an "Id." or "Ibid." reference.
        # In this case, the citation should simply be to the item cited
         # immediately prior, but for safety we will leave that resolution up
         # to the user.
         elif token_type is IdToken:
-            citation = _extract_id_citation(words, i)
+            citation = _extract_id_citation(document.words, i)
 
         # CASE 3: Token is a "supra" reference.
         # In this case, we're not sure yet what the citation's antecedent is.
         # It could be any of the previous citations above. Thus, like an Id.
         # citation, for safety we won't resolve this reference yet.
         elif token_type is SupraToken:
-            citation = _extract_supra_citation(words, i)
+            citation = _extract_supra_citation(document.words, i)
 
         # CASE 4: Token is a section marker.
         # In this case, it's likely that this is a reference to a citation,
@@ -142,48 +136,36 @@ def get_citations(
         citations = disambiguate_reporters(citations)
 
     # Returns a list of citations ordered in the sequence that they appear in
-    # the document. The ordering of this list is important for reconstructing
+    # the doc. The ordering of this list is important for reconstructing
     # the references of the ShortCaseCitation, SupraCitation, and
     # IdCitation and ReferenceCitation objects.
     return citations
 
 
 def extract_reference_citations(
-    citation: FullCitation,
-    plain_text: str,
-    markup_text: str = "",
-    plain_to_markup: Optional[SpanUpdater] = None,
-    markup_to_plain: Optional[SpanUpdater] = None,
+    citation: ResourceCitation, document: Document
 ) -> List[ReferenceCitation]:
     """Extract reference citations that follow a full citation
 
     :param citation: the full case citation found
-    :param plain_text: the text
-    :param markup_text: optional argument for source text with XML style tags
-        that may help extracting name-only ReferenceCitations
-    :param plain_to_markup: a SpanUpdater from plain or clean text to
-        marked up text
-    :param markup_to_plain: a SpanUpdater from marked up text to plain text
+    :param document: document object to parse
 
     :return: Reference citations
     """
-    if len(plain_text) <= citation.span()[-1]:
+    if len(document.plain_text) <= citation.span()[-1]:
         return []
     if not isinstance(citation, FullCaseCitation):
         return []
 
     reference_citations = extract_pincited_reference_citations(
-        citation, plain_text
+        citation, document.plain_text
     )
 
-    if markup_text:
+    if document.markup_text:
        reference_citations.extend(
             find_reference_citations_from_markup(
-                markup_text,
-                plain_text,
+                document,
                 [citation],
-                plain_to_markup,
-                markup_to_plain,
             )
         )
 
@@ -397,11 +379,8 @@ def _extract_id_citation(
 
 
 def find_reference_citations_from_markup(
-    markup_text: str,
-    plain_text: str,
+    document: Document,
     citations: list,
-    plain_to_markup: Optional[SpanUpdater] = None,
-    markup_to_plain: Optional[SpanUpdater] = None,
 ) -> list[ReferenceCitation]:
     """Use HTML/XML style tags and parties names to find ReferenceCitations
 
@@ -415,21 +394,12 @@ def find_reference_citations_from_markup(
     Creating the SpanUpdaters for each full citation will be too slow,
     re-use them if possible
 
-    :param markup_text: HTML or XML source
-    :param plain_text: cleaned text
+    :param document: Document object we are parsing
     :param citations: list of citations found over plain text. The full cites
         will be used to access parties names metadata
-    :param plain_to_markup: a SpanUpdater from plain or clean text to
-        marked up text
-    :param markup_to_plain: a SpanUpdater from marked up text to plain text
 
     :return: a list of ReferenceCitations
     """
-    if not markup_to_plain:
-        markup_to_plain = SpanUpdater(markup_text, plain_text)
-    if not plain_to_markup:
-        plain_to_markup = SpanUpdater(plain_text, markup_text)
-
     references = []
     tags = "|".join(["em", "i"])
 
@@ -458,30 +428,39 @@ def find_reference_citations_from_markup(
         # `utils.maybe_balance_style tags` for reference; it has some tolerance
         # which may be enough for these citations
         regex = rf"<(?:{tags})>\s*({'|'.join(regexes)})[:;.,\s]*</(?:{tags})>"
-        start_in_markup = plain_to_markup.update(
+
+        if (
+            not document.plain_to_markup
+            or not document.markup_to_plain
+            or not document.markup_text
+        ):
+            # ensure we have markup text
+            return []
+        start_in_markup = document.plain_to_markup.update(
             citation.span()[0], bisect_right
         )
-        for match in re.finditer(regex, markup_text[start_in_markup:]):
-            full_start_in_plain = markup_to_plain.update(
+        for match in re.finditer(
+            regex, document.markup_text[start_in_markup:]
+        ):
+            full_start_in_plain = document.markup_to_plain.update(
                 start_in_markup + match.start(), bisect_left
             )
-            full_end_in_plain = markup_to_plain.update(
+            full_end_in_plain = document.markup_to_plain.update(
                 start_in_markup + match.end(), bisect_right
             )
 
             # the first group [match.group(0)] is the whole match,
             # with whitespace and punctuation. the second group, match.group(1)
            # is the only capturing and named group
-            start_in_plain = markup_to_plain.update(
+            start_in_plain = document.markup_to_plain.update(
                 start_in_markup + match.start(1), bisect_left
             )
-            end_in_plain = markup_to_plain.update(
+            end_in_plain = document.markup_to_plain.update(
                 start_in_markup + match.end(1), bisect_right
             )
-
             reference = ReferenceCitation(
                 token=CaseReferenceToken(
-                    data=plain_text[start_in_plain:end_in_plain],
+                    data=document.plain_text[start_in_plain:end_in_plain],
                     start=start_in_plain,
                     end=end_in_plain,
                 ),
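With this refactor, callers of the public API never touch the `Document` or the `SpanUpdater` pair directly; passing `markup_text` along with an `html` clean step is enough for the markup-based reference pass to run. A rough end-to-end sketch (the opinion fragment is invented, and whether the second `<em>Bush</em>` actually comes back as a `ReferenceCitation` depends on the name-matching regexes built from the full citation):

from eyecite import get_citations
from eyecite.models import ReferenceCitation

markup = (
    "<p>In <em>Bush v. Gore</em>, 531 U.S. 98 (2000), the Court stayed the "
    "recount.</p><p>The <em>Bush</em> opinion was per curiam.</p>"
)

cites = get_citations(
    markup_text=markup, clean_steps=["html", "inline_whitespace"]
)

# Full citations and any markup-derived reference citations come back in
# document order; reference spans are over the cleaned plain text.
for cite in cites:
    if isinstance(cite, ReferenceCitation):
        print("reference:", cite.token.data if cite.token else None, cite.span())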

eyecite/models.py

+40
@@ -7,6 +7,7 @@
     Callable,
     Dict,
     Hashable,
+    Iterable,
     List,
     Optional,
     Sequence,
@@ -15,6 +16,8 @@
     cast,
 )
 
+from eyecite import clean_text
+from eyecite.annotate import SpanUpdater
 from eyecite.utils import REPORTERS_THAT_NEED_PAGE_CORRECTION, hash_sha256
 
 ResourceType = Hashable
@@ -859,3 +862,40 @@ def __hash__(self):
 
     def __eq__(self, other):
         return self.__hash__() == other.__hash__()
+
+
+@dataclass(eq=False, unsafe_hash=False)
+class Document:
+    plain_text: str = ""
+    markup_text: Optional[str] = ""
+    citation_tokens: list[Tuple[int, Token]] = field(default_factory=list)
+    words: Tokens = field(default_factory=list)
+    plain_to_markup: Optional[SpanUpdater] = field(default=None, init=False)
+    markup_to_plain: Optional[SpanUpdater] = field(default=None, init=False)
+    clean_steps: Optional[Iterable[Union[str, Callable[[str], str]]]] = field(
+        default_factory=list
+    )
+
+    def __post_init__(self):
+        if self.plain_text and self.clean_steps:
+            self.plain_text = clean_text(self.plain_text, self.clean_steps)
+
+        if self.markup_text != "":
+            if "html" not in self.clean_steps:
+                raise (
+                    "`html` is a required cleanup step for markup text",
+                    self.markup_text,
+                )
+
+            self.plain_text = clean_text(self.markup_text, self.clean_steps)
+
+            self.plain_to_markup = SpanUpdater(
+                self.plain_text, self.markup_text
+            )
+            self.markup_to_plain = SpanUpdater(
+                self.markup_text, self.plain_text
+            )
+
+    def tokenize(self, tokenizer):
+        # Tokenize the document and store the results in the document object
+        self.words, self.citation_tokens = tokenizer.tokenize(self.plain_text)
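The `Document` dataclass above can also be driven directly, which is what `get_citations` now does under the hood: cleaning happens in `__post_init__` (with `html` required in `clean_steps` when `markup_text` is set), the two `SpanUpdater`s are built there, and `tokenize()` fills `words` and `citation_tokens`. A minimal sketch, assuming `default_tokenizer` lives in `eyecite.tokenizers` (that import path is not shown in this diff) and using an invented markup string:

from eyecite.models import Document
from eyecite.tokenizers import default_tokenizer

doc = Document(
    markup_text="<p>See <em>Bush v. Gore</em>, 531 U.S. 98 (2000).</p>",
    clean_steps=["html", "inline_whitespace"],  # "html" is required for markup
)

# __post_init__ has already derived plain_text and both SpanUpdaters.
doc.tokenize(tokenizer=default_tokenizer)
print(doc.plain_text)
print(len(doc.citation_tokens), "citation token(s)")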
