Skip to content

Commit 39142e5

Browse files
committed
fix(annotate_citations): try to include HTML style tags if not balanced
Some annotations, especially for ReferenceCitations, are discarded in HTML sources because some style tags (mostly `<i>` or `<em>`) are not balanced. This PR tries to include the style tags in the citation span. It also adds tests for `utils.maybe_balance_style_tags`. Solves #196.
1 parent abfc7f7 commit 39142e5

File tree

3 files changed

+161
-12
lines changed

3 files changed

+161
-12
lines changed

eyecite/annotate.py

+15-9
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55

66
import fast_diff_match_patch
77

8-
from eyecite.utils import is_balanced_html, wrap_html_tags
9-
8+
from eyecite.utils import is_balanced_html, wrap_html_tags, maybe_balance_style_tags
9+
from logging import getLogger
10+
logger = getLogger("eyecite")
1011

1112
def annotate_citations(
1213
plain_text: str,
@@ -59,6 +60,9 @@ def annotate_citations(
5960
Returns:
6061
The annotated text.
6162
"""
63+
if unbalanced_tags not in ["unchecked", "skip", "wrap"]:
64+
raise ValueError(f"Unknown option '{unbalanced_tags}")
65+
6266
# set up offset_updater if we have to move annotations to source_text
6367
offset_updater = None
6468
if source_text and source_text != plain_text:
@@ -88,14 +92,16 @@ def annotate_citations(
8892
# handle HTML tags
8993
if unbalanced_tags == "unchecked":
9094
pass
91-
elif unbalanced_tags in ("skip", "wrap"):
92-
if not is_balanced_html(span_text):
93-
if unbalanced_tags == "skip":
94-
continue
95+
elif not is_balanced_html(span_text):
96+
if unbalanced_tags == "wrap":
9597
span_text = wrap_html_tags(span_text, after, before)
96-
else:
97-
raise ValueError(f"Unknown option '{unbalanced_tags}")
98-
98+
else: # "skip" case
99+
original_span_text = span_text
100+
start, end, span_text = maybe_balance_style_tags(start, end, plain_text)
101+
if not is_balanced_html(span_text):
102+
logger.error("Citation was not annotated due to unbalanced tags %s", original_span_text)
103+
continue
104+
99105
if annotator is not None:
100106
annotated_span = annotator(before, span_text, after)
101107
else:

eyecite/utils.py

+44
Original file line numberDiff line numberDiff line change
@@ -130,3 +130,47 @@ def hash_sha256(dictionary: dict) -> int:
130130

131131
# Calculate the hash of the bytes, convert to an int, and return
132132
return int.from_bytes(hashlib.sha256(json_bytes).digest(), byteorder="big")
133+
134+
135+
def maybe_balance_style_tags(
    start: int, end: int, plain_text: str
) -> tuple[int, int, str]:
    """Try to include style tags at the edge of the span marked as invalid.

    In some HTML sources the citations are styled with tags like <i> or <em>.
    When the citation is found in a stripped-of-tags text, the span may
    leave out the opening or closing tag. When this happens and we try to
    annotate the HTML, it will render invalid HTML. This happens mostly with
    IdCitation, ReferenceCitation, etc.

    This function will try to find opening or closing tags immediately
    preceding or following the citation span, separated from it by at most
    a few whitespace characters. If it finds them, it will return the new
    start, end and span. If not, it will return the old ones.

    :param start: the original start of the span
    :param end: the original end of the span
    :param plain_text: the text to annotate
    :return: a tuple (new start, new end, new span text)
    """
    span_text = plain_text[start:end]
    style_tags = ["i", "em", "b"]
    tolerance = 5  # tolerate at most this amount of whitespace

    for tag in style_tags:
        opening_tag = f"<{tag}>"
        closing_tag = f"</{tag}>"
        has_opening = opening_tag in span_text
        has_closing = closing_tag in span_text

        if has_opening and not has_closing:
            # Look for the closing tag right after the end. The window is
            # clamped to the closing tag plus the tolerated whitespace, so
            # the tolerance is actually enforced, and the match is anchored
            # at the span's end so unrelated tags further away are ignored.
            window_end = min(
                end + len(closing_tag) + tolerance, len(plain_text)
            )
            if end_match := re.match(
                rf"\s*{re.escape(closing_tag)}", plain_text[end:window_end]
            ):
                end += end_match.end()

        if not has_opening and has_closing:
            # Look for the opening tag right before the start. Clamp at 0:
            # a negative index would slice from the end of the text.
            window_start = max(start - len(opening_tag) - tolerance, 0)
            if start_match := re.search(
                rf"{re.escape(opening_tag)}\s*\Z",
                plain_text[window_start:start],
            ):
                start = window_start + start_match.start()

    return start, end, plain_text[start:end]
175+
176+

tests/test_AnnotateTest.py

+102-3
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def straighten_quotes(text):
1111

1212
def lower_annotator(before, text, after):
1313
return before + text.lower() + after
14-
14+
self.maxDiff=None
1515
test_pairs = (
1616
# single cite
1717
("1 U.S. 1", "<0>1 U.S. 1</0>", []),
@@ -59,10 +59,10 @@ def lower_annotator(before, text, after):
5959
"<body>foo <i><0>1 <b>U.S.</b></i> 1</0> bar</body>",
6060
["html", "inline_whitespace"],
6161
),
62-
# whitespace and html -- skip unbalanced tags
62+
# whitespace and html -- unbalanced tags are repaired
6363
(
6464
"foo <i>1 U.S.</i> 1; 2 <i>U.S.</i> 2",
65-
"foo <i>1 U.S.</i> 1; <1>2 <i>U.S.</i> 2</1>",
65+
"foo <0><i>1 U.S.</i> 1</0>; <1>2 <i>U.S.</i> 2</1>",
6666
["html", "inline_whitespace"],
6767
{"unbalanced_tags": "skip"},
6868
),
@@ -101,6 +101,98 @@ def lower_annotator(before, text, after):
101101
[],
102102
{"annotator": lower_annotator},
103103
),
104+
# solvable unbalanced <em> tag. Need the FullCaseCitation first
105+
# so the ReferenceCitation can be found
106+
# from https://www.courtlistener.com/api/rest/v4/opinions/8496639/
107+
# source: Opinion.xml_harvard
108+
(
109+
" partially secured by a debtor’s principal residence was not "
110+
"con-firmable. <em>Nobelman v. Am. Sav. Bank, </em>"
111+
"508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That "
112+
"plan proposed to bifurcate the claim and... pay the unsecured"
113+
"... only by a lien on the debtor’s principal residence.” "
114+
"<em>Nobelman </em>at 332, 113 S.Ct. 2106. Section 1123(b)(5) "
115+
"codifies the <em>Nobelman </em>decision in individual debtor "
116+
"chapter 11 cases.",
117+
" partially secured by a debtor’s principal residence was not"
118+
" con-firmable. <em>Nobelman v. Am. Sav. Bank, </em>"
119+
"<a href='something'>508 U.S. 324</a>, <a href='something'>"
120+
"113 S.Ct. 2106</a>, <a href='something'>124 L.Ed.2d 228</a>"
121+
" (1993). That plan proposed to bifurcate the claim and..."
122+
" pay the unsecured... only by a lien on the debtor’s"
123+
" principal residence.” <a href='something'><em>Nobelman </em>"
124+
"at 332</a>, <a href='something'>113 S.Ct. 2106</a>. Section"
125+
" 1123(b)(5) codifies the <em>Nobelman </em>decision in"
126+
" individual debtor chapter 11 cases.",
127+
["html", "all_whitespace"],
128+
{"annotate_anchors": True, "unbalanced_tags": "skip"}
129+
),
130+
# solvable unbalanced <i> tag
131+
# from https://www.courtlistener.com/api/rest/v4/opinions/2841253/
132+
# source: Opinion.html
133+
(
134+
"he has not agreed so to submit.’” <i>Howsam v. Dean"
135+
" Witter Reynolds, Inc.</i>, 537 U.S. 79, 83, 123 S. Ct."
136+
" 588, 591 (2002) (combined mandamus and"
137+
" interlocutory appeal) (citing <i>Howsam</i> at 84, 123"
138+
" S. Ct. at 592)",
139+
140+
"he has not agreed so to submit.’” <i>Howsam v. Dean"
141+
" Witter Reynolds, Inc.</i>, <a href='something'>537 U.S."
142+
" 79</a>, 83, <a href='something'>123 S. Ct. 588</a>, 591"
143+
" (2002) (combined mandamus and interlocutory appeal)"
144+
" (citing <a href='something'><i>Howsam</i> at 84</a>, <a"
145+
" href='something'>123 S. Ct. at 592</a>)",
146+
147+
["html", "all_whitespace"],
148+
{"annotate_anchors": True, "unbalanced_tags": "skip"}
149+
),
150+
# The next 2 examples could be resolved if we increased the
151+
# character tolerance or admitted the full case name instead of
152+
# just one of the parties
153+
(
154+
# https://www.courtlistener.com/api/rest/v4/opinions/1535649/
155+
# source: xml_harvard
156+
"See also Styler v. Tall Oaks, Inc. (In re Hatch),"
157+
" 93 B.R. 263, 267 (Bankr.D. Utah 1988),"
158+
" <em> rev'd </em> 114 B.R. 747 (D.Utah 1989)."
159+
"</p>... The court makes no"
160+
" determination as to whe Fifth Amendment to the"
161+
" constitution of the United States.” <em> Styler v."
162+
" Tall Oaks, Inc. (In re Hatch), </em> at 748."
163+
"</p>",
164+
"See also Styler v. Tall Oaks, Inc. (In re Hatch),"
165+
" <a href='something'>93 B.R. 263</a>, 267"
166+
" (Bankr.D. Utah 1988), <em> rev'd </em> <a"
167+
" href='something'>114 B.R. 747</a> (D.Utah 1989)."
168+
"</p>... The court makes no"
169+
" determination as to whe Fifth Amendment to the"
170+
" constitution of the United States.” <em> Styler v."
171+
" Tall Oaks, Inc. (In re Hatch), </em> at 748."
172+
"</p>",
173+
["html", "all_whitespace"],
174+
{"annotate_anchors": True, "unbalanced_tags": "skip"}
175+
),
176+
(
177+
# https://www.courtlistener.com/api/rest/v4/opinions/1985850/
178+
# source: html_lawbox
179+
"to act rationally. <i>See, e.g., </i><i>State v."
180+
" Wingler,</i> 25 <i>N.J.</i> 161, 175, 135 <i>A.</i>2d"
181+
" 468 (1957); <i>citing, ... have been applied.'"
182+
" [<i>State v. Wingler</i> at 175, 135 <i>A.</i>2d"
183+
" 468, <i>citing, </i><i>Minnesota ex rel.</i>",
184+
185+
"to act rationally. <i>See, e.g., </i><i>State v."
186+
" Wingler,</i> <a href='something'>25 <i>N.J.</i>"
187+
" 161</a>, 175, <a href='something'>135 <i>A.</i>2d"
188+
" 468</a> (1957); <i>citing, ... have been applied.'"
189+
" [<i>State v. Wingler</i> at 175, <a"
190+
" href='something'>135 <i>A.</i>2d 468</a>, <i>citing,"
191+
" </i><i>Minnesota ex rel.</i>",
192+
["html", "all_whitespace"],
193+
{"annotate_anchors": True, "unbalanced_tags": "skip"},
194+
)
195+
104196
)
105197
for source_text, expected, clean_steps, *annotate_kwargs in test_pairs:
106198
annotate_kwargs = annotate_kwargs[0] if annotate_kwargs else {}
@@ -115,6 +207,13 @@ def lower_annotator(before, text, after):
115207
(c.span(), f"<{i}>", f"</{i}>")
116208
for i, c in enumerate(cites)
117209
]
210+
211+
if annotate_kwargs.pop("annotate_anchors", False):
212+
annotations = [
213+
(c.span(), "<a href='something'>", "</a>")
214+
for c in cites
215+
]
216+
118217
annotated = annotate_citations(
119218
plain_text,
120219
annotations,

0 commit comments

Comments
 (0)