Skip to content

Commit f6c6e89

Browse files
committed
feat(eyecite): Improve name extraction
Add a check inside the method for the plaintiff, and use it to identify the full plaintiff name. Also improve regex building to avoid unnecessary escapes.
1 parent 7f49660 commit f6c6e89

File tree

4 files changed

+255
-20
lines changed

4 files changed

+255
-20
lines changed

eyecite/find.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def get_citations(
8686
if citation_token.short:
8787
citation = _extract_shortform_citation(document.words, i)
8888
else:
89-
citation = _extract_full_citation(document.words, i)
89+
citation = _extract_full_citation(document, i)
9090
if (
9191
citations
9292
and isinstance(citation, FullCaseCitation)
@@ -215,7 +215,7 @@ def extract_pincited_reference_citations(
215215

216216

217217
def _extract_full_citation(
218-
words: Tokens,
218+
document: Document,
219219
index: int,
220220
) -> FullCitation:
221221
"""Given a list of words and the index of a citation, return
@@ -225,7 +225,7 @@ def _extract_full_citation(
225225
# one or more of the sources in reporters_db (e.g. reporters, laws,
226226
# journals). Get the set of all sources that matched, preferring exact
227227
# matches to variations:
228-
token = cast(CitationToken, words[index])
228+
token = cast(CitationToken, document.words[index])
229229
cite_sources = set(
230230
e.reporter.source
231231
for e in (token.exact_editions or token.variation_editions)
@@ -249,7 +249,7 @@ def _extract_full_citation(
249249
exact_editions=token.exact_editions,
250250
variation_editions=token.variation_editions,
251251
)
252-
citation.add_metadata(words)
252+
citation.add_metadata(document)
253253

254254
return citation
255255

@@ -413,10 +413,10 @@ def find_reference_citations_from_markup(
413413
continue
414414
if not is_valid_name(value):
415415
continue
416-
value = re.sub(r"\s+", re.escape(" "), re.escape(value.strip()))
417-
regexes.append(
418-
r"(?P<{}>{})".format(key, value.replace(" ", r"\s+"))
416+
regex_value = r"\s+".join(
417+
re.escape(token) for token in value.strip().split()
419418
)
419+
regexes.append(r"(?P<{}>{})".format(key, regex_value))
420420
if not regexes:
421421
continue
422422

eyecite/helpers.py

+45-1
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
import logging
2+
from bisect import bisect_right
23
from datetime import date
34
from typing import List, Optional, Tuple, cast
45

56
import regex as re
7+
from bs4 import BeautifulSoup
68
from courts_db import courts
79

810
from eyecite.models import (
911
CaseCitation,
1012
CitationBase,
1113
CitationToken,
14+
Document,
1215
FullCaseCitation,
1316
FullJournalCitation,
1417
FullLawCitation,
@@ -129,13 +132,50 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
129132
citation.metadata.court = get_court_by_paren(m["court"])
130133

131134

132-
def add_defendant(citation: CaseCitation, words: Tokens) -> None:
135+
def update_plaintiff_from_markup(
    document: "Document", citation: "CaseCitation", offset: int
) -> None:
    """Extend the plaintiff name to the start of its enclosing emphasis tag.

    The plaintiff stored on the citation is located in the plain text, mapped
    into the markup, and compared against every ``<em>``/``<i>`` element. If
    one element fully contains it, ``citation.metadata.plaintiff`` is replaced
    by the plain text running from the start of that element to the end of
    the current plaintiff. If no element contains it, nothing is changed.

    Args:
        document: The document being parsed. Its ``markup_text``,
            ``plain_text``, ``plain_to_markup`` and ``markup_to_plain``
            attributes are read.
        citation: The citation whose ``metadata.plaintiff`` may be rewritten.
        offset: Number of characters subtracted from ``citation.span()[0]``
            to obtain the plain-text position where the plaintiff is taken
            to begin.

    Raises:
        ValueError: If either offset map on the document is ``None``.
    """
    if document.plain_to_markup is None:
        raise ValueError("document.plain_to_markup must not be None")
    # Guard the reverse map too: it is dereferenced below, and failing here
    # gives the same explicit error instead of an AttributeError mid-loop.
    if document.markup_to_plain is None:
        raise ValueError("document.markup_to_plain must not be None")

    start = citation.span()[0] - offset
    end = start + len(citation.metadata.plaintiff)

    m_start = document.plain_to_markup.update(start, bisect_right)
    m_end = document.plain_to_markup.update(end, bisect_right)

    markup = document.markup_text
    soup = BeautifulSoup(markup, "html.parser")

    # find_all() yields elements in document order, so each element starts
    # after the previous one. Searching from just past the previous match
    # keeps a repeated, textually identical tag (e.g. the same case name
    # emphasised twice) from always resolving to its first occurrence.
    search_from = 0
    for tag in soup.find_all(["em", "i"]):
        tag_html = str(tag)
        tag_start = markup.find(tag_html, search_from)
        if tag_start == -1:
            # The serialised element differs from the raw markup; its
            # position cannot be determined, so skip it.
            continue
        search_from = tag_start + 1

        if tag_start > m_start:
            # Every remaining element starts even later, so none of them
            # can contain the plaintiff.
            break

        tag_end = tag_start + len(tag_html)
        if m_end <= tag_end:
            # The plaintiff lies entirely inside this element: take the
            # plain text from the element's start to the plaintiff's end.
            t_start = document.markup_to_plain.update(tag_start, bisect_right)
            citation.metadata.plaintiff = document.plain_text[t_start:end]
            break
169+
170+
171+
def add_defendant(citation: CaseCitation, document: Document) -> None:
133172
"""Scan backwards from reporter until you find v., in re,
134173
etc. If no known stop-token is found, no defendant name is stored. In the
135174
future, this could be improved.
136175
"""
137176
# To turn word indexing into char indexing,
138177
# useful for span, account for shift
178+
words = document.words
139179
offset = 0
140180
start_index = None
141181
back_seek = citation.index - BACKWARD_SEEK
@@ -151,6 +191,10 @@ def add_defendant(citation: CaseCitation, words: Tokens) -> None:
151191
str(w) for w in words[max(index - 2, 0) : index]
152192
).strip("( ")
153193
offset += len(citation.metadata.plaintiff) + 1
194+
195+
if document.markup_text:
196+
update_plaintiff_from_markup(document, citation, offset)
197+
154198
else:
155199
# We don't want to include stop words such as
156200
# 'citing' in the span

eyecite/models.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ class Metadata(CitationBase.Metadata):
291291

292292
year: Optional[str] = None
293293

294-
def add_metadata(self, words: "Tokens"):
294+
def add_metadata(self, document: "Document"):
295295
"""Extract metadata from text before and after citation."""
296296
self.guess_edition()
297297

@@ -375,13 +375,13 @@ class Metadata(FullCitation.Metadata):
375375
day: Optional[str] = None
376376
month: Optional[str] = None
377377

378-
def add_metadata(self, words: "Tokens"):
378+
def add_metadata(self, document: "Document"):
379379
"""Extract metadata from text before and after citation."""
380380
# pylint: disable=import-outside-toplevel
381381
from eyecite.helpers import add_law_metadata
382382

383-
add_law_metadata(self, words)
384-
super().add_metadata(words)
383+
add_law_metadata(self, document.words)
384+
super().add_metadata(document)
385385

386386
def corrected_citation_full(self):
387387
"""Return citation with any variations normalized, including extracted
@@ -404,13 +404,13 @@ def corrected_citation_full(self):
404404
class FullJournalCitation(FullCitation):
405405
"""Citation to a source from `reporters_db/journals.json`."""
406406

407-
def add_metadata(self, words: "Tokens"):
407+
def add_metadata(self, document: "Document"):
408408
"""Extract metadata from text before and after citation."""
409409
# pylint: disable=import-outside-toplevel
410410
from eyecite.helpers import add_journal_metadata
411411

412-
add_journal_metadata(self, words)
413-
super().add_metadata(words)
412+
add_journal_metadata(self, document.words)
413+
super().add_metadata(document)
414414

415415
def corrected_citation_full(self):
416416
"""Return citation with any variations normalized, including extracted
@@ -521,7 +521,7 @@ class Metadata(CaseCitation.Metadata):
521521
resolved_case_name_short: Optional[str] = None
522522
resolved_case_name: Optional[str] = None
523523

524-
def add_metadata(self, words: "Tokens"):
524+
def add_metadata(self, document: "Document"):
525525
"""Extract metadata from text before and after citation."""
526526
# pylint: disable=import-outside-toplevel
527527
from eyecite.helpers import (
@@ -530,12 +530,12 @@ def add_metadata(self, words: "Tokens"):
530530
add_pre_citation,
531531
)
532532

533-
add_post_citation(self, words)
534-
add_defendant(self, words)
535-
add_pre_citation(self, words)
533+
add_post_citation(self, document.words)
534+
add_defendant(self, document)
535+
add_pre_citation(self, document.words)
536536

537537
self.guess_court()
538-
super().add_metadata(words)
538+
super().add_metadata(document)
539539

540540
def corrected_citation_full(self):
541541
"""Return formatted version of extracted cite."""

tests/test_FindTest.py

+191
Original file line numberDiff line numberDiff line change
@@ -1121,3 +1121,194 @@ def test_reference_filtering(self):
11211121
[isinstance(cite, ReferenceCitation) for cite in citations]
11221122
)
11231123
)
1124+
1125+
def test_unbalanced_case_names(self) -> None:
    """Check case-name extraction when emphasis tags split or wrap the names.

    Each entry in ``test_pairs`` is ``(markup, expected citations, keyword
    arguments)`` and is handed to ``self.run_test_pairs``. Every case uses the
    ``html`` and ``all_whitespace`` clean steps.
    """
    # Can full case names be identified in markup text?
    test_pairs = (
        # Case name split unevenly across two adjacent <em> tags
        (
            (
                "and more and more <em>Jin Fuey Moy</em><em>v. United States,</em>\n"
                " 254 U.S. 189. Petitioner contends"
            ),
            [
                case_citation(
                    volume="254",
                    reporter="U.S.",
                    page="189",
                    metadata={
                        "plaintiff": "Jin Fuey Moy",
                        "defendant": "United States",
                    },
                )
            ],
            {"clean_steps": ["html", "all_whitespace"]},
        ),
        # Extract the name from one tag and ignore an earlier, unrelated tag
        (
            (
                "<em>Overruled</em> and so on <em>Jin Fuey Moy v. United States,</em> "
                "254 U.S. 189. Petitioner contends"
            ),
            [
                case_citation(
                    volume="254",
                    reporter="U.S.",
                    page="189",
                    metadata={
                        "plaintiff": "Jin Fuey Moy",
                        "defendant": "United States",
                    },
                )
            ],
            {"clean_steps": ["html", "all_whitespace"]},
        ),
        # Corporation name as plaintiff, "v." outside the tags
        (
            "<em>Bell Atlantic Corp. </em>v. <em>Twombly, </em>550 U. S. 544 (2007),",
            [
                case_citation(
                    volume="550",
                    reporter="U. S.",
                    page="544",
                    year=2007,
                    metadata={
                        "plaintiff": "Bell Atlantic Corp.",
                        "defendant": "Twombly",
                        "year": "2007",
                        "court": "scotus",
                    },
                )
            ],
            {"clean_steps": ["html", "all_whitespace"]},
        ),
        # Two-word plaintiff inside a single tag
        (
            "con-firmable. <em>United States v. Am. Sav. Bank, </em> 508 U.S. 324 (1993). That plan "
            "proposed to bifurcate the claim and",
            [
                case_citation(
                    volume="508",
                    reporter="U.S.",
                    page="324",
                    year=1993,
                    metadata={
                        "plaintiff": "United States",
                        "defendant": "Am. Sav. Bank",
                        "year": "1993",
                        "court": "scotus",
                    },
                )
            ],
            {"clean_steps": ["html", "all_whitespace"]},
        ),
        # Full plaintiff name is also found as a later reference citation
        (
            (
                ". <em>Jin Fuey Moy</em> <em>v. United States,</em> 254 U.S. 189. Petitioner contends. "
                "Regardless in <em>Jin Fuey Moy</em> the court ruled"
            ),
            [
                case_citation(
                    volume="254",
                    reporter="U.S.",
                    page="189",
                    metadata={
                        "plaintiff": "Jin Fuey Moy",
                        "defendant": "United States",
                    },
                ),
                reference_citation(
                    "Jin Fuey Moy", metadata={"plaintiff": "Jin Fuey Moy"}
                ),
            ],
            {"clean_steps": ["html", "all_whitespace"]},
        ),
        # Names in two separate tags, with newlines and padding inside them
        (
            (
                '<p id="b453-6">\n'
                " The supreme court of Connecticut, in\n"
                " <em>\n"
                " Beardsley\n"
                " </em>\n"
                " v.\n"
                " <em>\n"
                " Hartford,\n"
                " </em>\n"
                " 50 Conn. 529, 541-542, after quoting the maxim of the common law;\n"
                " <em>\n"
                " cessante ratione legis-, cessat ipsa lex,\n"
                " </em>"
            ),
            [
                case_citation(
                    volume="50",
                    reporter="Conn.",
                    page="529",
                    metadata={
                        "plaintiff": "Beardsley",
                        "defendant": "Hartford",
                        "pin_cite": "541-542",
                    },
                )
            ],
            {"clean_steps": ["html", "all_whitespace"]},
        ),
        # Two-word plaintiff reused later as a reference citation
        (
            (
                " partially secured by a debtor’s principal residence was not "
                "con-firmable. <em>Smart Nobelman v. Am. Sav. Bank, </em>"
                "508 U.S. 324 (1993). That plan proposed to bifurcate the claim and... pay the unsecured"
                "... only by a lien on the debtor’s principal residence.” "
                "codifies the <em>Smart Nobelman </em>decision in individual debtor chapter 11 cases."
            ),
            [
                case_citation(
                    volume="508",
                    reporter="U.S.",
                    page="324",
                    metadata={
                        "plaintiff": "Smart Nobelman",
                        "defendant": "Am. Sav. Bank",
                        "year": "1993",
                    },
                ),
                reference_citation(
                    "Smart Nobelman",
                    metadata={"plaintiff": "Smart Nobelman"},
                ),
            ],
            {"clean_steps": ["html", "all_whitespace"]},
        ),
        # Reference citation that carries a pin cite ("at 332")
        (
            (
                " partially secured by a debtor’s principal residence was not "
                "con-firmable. <em>Nobelman v. Am. Sav. Bank, </em>"
                "508 U.S. 324 (1993). That plan proposed to bifurcate the claim and... pay the unsecured"
                "... only by a lien on the debtor’s principal residence.” "
                "codifies the a lien on the debtor’s principal residence.” "
                "<em>Nobelman </em>at 332, decision in individual debtor chapter 11 cases."
            ),
            [
                case_citation(
                    volume="508",
                    reporter="U.S.",
                    page="324",
                    metadata={
                        "plaintiff": "Nobelman",
                        "defendant": "Am. Sav. Bank",
                        "year": "1993",
                    },
                ),
                reference_citation(
                    "Nobelman",
                    metadata={"plaintiff": "Nobelman", "pin_cite": "332"},
                ),
            ],
            {"clean_steps": ["html", "all_whitespace"]},
        ),
    )
    self.run_test_pairs(test_pairs, "Citation extraction")

0 commit comments

Comments
 (0)