1
1
import re
2
2
from bisect import bisect_left , bisect_right
3
- from typing import List , Optional , Type , cast
3
+ from typing import Callable , Iterable , List , Optional , Type , Union , cast
4
4
5
- from eyecite .annotate import SpanUpdater
6
5
from eyecite .helpers import (
7
6
disambiguate_reporters ,
8
7
extract_pin_cite ,
14
13
CaseReferenceToken ,
15
14
CitationBase ,
16
15
CitationToken ,
16
+ Document ,
17
17
FullCaseCitation ,
18
18
FullCitation ,
19
19
FullJournalCitation ,
35
35
36
36
37
37
def get_citations (
38
- plain_text : str ,
38
+ plain_text : str = "" ,
39
39
remove_ambiguous : bool = False ,
40
40
tokenizer : Tokenizer = default_tokenizer ,
41
41
markup_text : str = "" ,
42
+ clean_steps : Optional [Iterable [Union [str , Callable [[str ], str ]]]] = None ,
42
43
) -> List [CitationBase ]:
43
44
"""This is eyecite's main workhorse function. Given a string of text
44
- (e.g., a judicial opinion or other legal document ), return a list of
45
+ (e.g., a judicial opinion or other legal document ), return a list of
45
46
`eyecite.models.CitationBase` objects representing the citations found
46
- in the document .
47
+ in the document .
47
48
48
49
Args:
49
50
plain_text: The text to parse. You may wish to use the
@@ -57,23 +58,22 @@ def get_citations(
57
58
markup_text: if the source text has markup (XML or HTML mostly), pass
58
59
it to extract ReferenceCitations that may be detectable via
59
60
markup style tags
61
+ clean_steps: optional iterable of cleaning steps (each a str or a callable taking and returning a str), passed on to the Document
60
62
61
63
Returns:
62
64
A list of `eyecite.models.CitationBase` objects
63
65
"""
64
66
if plain_text == "eyecite" :
65
67
return joke_cite
66
68
67
- words , citation_tokens = tokenizer .tokenize (plain_text )
69
+ document = Document (
70
+ plain_text = plain_text ,
71
+ markup_text = markup_text ,
72
+ clean_steps = clean_steps ,
73
+ )
74
+ document .tokenize (tokenizer = tokenizer )
68
75
citations : list [CitationBase ] = []
69
-
70
- if markup_text :
71
- plain_to_markup = SpanUpdater (plain_text , markup_text )
72
- markup_to_plain = SpanUpdater (markup_text , plain_text )
73
- else :
74
- plain_to_markup , markup_to_plain = None , None
75
-
76
- for i , token in citation_tokens :
76
+ for i , token in document .citation_tokens :
77
77
citation : CitationBase
78
78
token_type = type (token )
79
79
@@ -84,9 +84,9 @@ def get_citations(
84
84
if token_type is CitationToken :
85
85
citation_token = cast (CitationToken , token )
86
86
if citation_token .short :
87
- citation = _extract_shortform_citation (words , i )
87
+ citation = _extract_shortform_citation (document . words , i )
88
88
else :
89
- citation = _extract_full_citation (words , i )
89
+ citation = _extract_full_citation (document . words , i )
90
90
if (
91
91
citations
92
92
and isinstance (citation , FullCaseCitation )
@@ -97,28 +97,22 @@ def get_citations(
97
97
98
98
# Check for reference citations that follow a full citation
99
99
# Using the plaintiff or defendant
100
- references = extract_reference_citations (
101
- citation ,
102
- plain_text ,
103
- markup_text ,
104
- plain_to_markup ,
105
- markup_to_plain ,
106
- )
100
+ references = extract_reference_citations (citation , document )
107
101
citations .extend (references )
108
102
109
103
# CASE 2: Token is an "Id." or "Ibid." reference.
110
104
# In this case, the citation should simply be to the item cited
111
105
# immediately prior, but for safety we will leave that resolution up
112
106
# to the user.
113
107
elif token_type is IdToken :
114
- citation = _extract_id_citation (words , i )
108
+ citation = _extract_id_citation (document . words , i )
115
109
116
110
# CASE 3: Token is a "supra" reference.
117
111
# In this case, we're not sure yet what the citation's antecedent is.
118
112
# It could be any of the previous citations above. Thus, like an Id.
119
113
# citation, for safety we won't resolve this reference yet.
120
114
elif token_type is SupraToken :
121
- citation = _extract_supra_citation (words , i )
115
+ citation = _extract_supra_citation (document . words , i )
122
116
123
117
# CASE 4: Token is a section marker.
124
118
# In this case, it's likely that this is a reference to a citation,
@@ -142,48 +136,36 @@ def get_citations(
142
136
citations = disambiguate_reporters (citations )
143
137
144
138
# Returns a list of citations ordered in the sequence that they appear in
145
- # the document . The ordering of this list is important for reconstructing
139
+ # the document . The ordering of this list is important for reconstructing
146
140
# the references of the ShortCaseCitation, SupraCitation, and
147
141
# IdCitation and ReferenceCitation objects.
148
142
return citations
149
143
150
144
151
145
def extract_reference_citations (
152
- citation : FullCitation ,
153
- plain_text : str ,
154
- markup_text : str = "" ,
155
- plain_to_markup : Optional [SpanUpdater ] = None ,
156
- markup_to_plain : Optional [SpanUpdater ] = None ,
146
+ citation : ResourceCitation , document : Document
157
147
) -> List [ReferenceCitation ]:
158
148
"""Extract reference citations that follow a full citation
159
149
160
150
:param citation: the full case citation found
161
- :param plain_text: the text
162
- :param markup_text: optional argument for source text with XML style tags
163
- that may help extracting name-only ReferenceCitations
164
- :param plain_to_markup: a SpanUpdater from plain or clean text to
165
- marked up text
166
- :param markup_to_plain: a SpanUpdater from marked up text to plain text
151
+ :param document: document object to parse
167
152
168
153
:return: Reference citations
169
154
"""
170
- if len (plain_text ) <= citation .span ()[- 1 ]:
155
+ if len (document . plain_text ) <= citation .span ()[- 1 ]:
171
156
return []
172
157
if not isinstance (citation , FullCaseCitation ):
173
158
return []
174
159
175
160
reference_citations = extract_pincited_reference_citations (
176
- citation , plain_text
161
+ citation , document . plain_text
177
162
)
178
163
179
- if markup_text :
164
+ if document . markup_text :
180
165
reference_citations .extend (
181
166
find_reference_citations_from_markup (
182
- markup_text ,
183
- plain_text ,
167
+ document ,
184
168
[citation ],
185
- plain_to_markup ,
186
- markup_to_plain ,
187
169
)
188
170
)
189
171
@@ -397,11 +379,8 @@ def _extract_id_citation(
397
379
398
380
399
381
def find_reference_citations_from_markup (
400
- markup_text : str ,
401
- plain_text : str ,
382
+ document : Document ,
402
383
citations : list ,
403
- plain_to_markup : Optional [SpanUpdater ] = None ,
404
- markup_to_plain : Optional [SpanUpdater ] = None ,
405
384
) -> list [ReferenceCitation ]:
406
385
"""Use HTML/XML style tags and parties names to find ReferenceCitations
407
386
@@ -415,21 +394,12 @@ def find_reference_citations_from_markup(
415
394
Creating the SpanUpdaters for each full citation will be too slow,
416
395
re-use them if possible
417
396
418
- :param markup_text: HTML or XML source
419
- :param plain_text: cleaned text
397
+ :param document: Document object we are parsing
420
398
:param citations: list of citations found over plain text. The full cites
421
399
will be used to access parties names metadata
422
- :param plain_to_markup: a SpanUpdater from plain or clean text to
423
- marked up text
424
- :param markup_to_plain: a SpanUpdater from marked up text to plain text
425
400
426
401
:return: a list of ReferenceCitations
427
402
"""
428
- if not markup_to_plain :
429
- markup_to_plain = SpanUpdater (markup_text , plain_text )
430
- if not plain_to_markup :
431
- plain_to_markup = SpanUpdater (plain_text , markup_text )
432
-
433
403
references = []
434
404
tags = "|" .join (["em" , "i" ])
435
405
@@ -458,30 +428,39 @@ def find_reference_citations_from_markup(
458
428
# `utils.maybe_balance_style tags` for reference; it has some tolerance
459
429
# which may be enough for these citations
460
430
regex = rf"<(?:{ tags } )>\s*({ '|' .join (regexes )} )[:;.,\s]*</(?:{ tags } )>"
461
- start_in_markup = plain_to_markup .update (
431
+
432
+ if (
433
+ not document .plain_to_markup
434
+ or not document .markup_to_plain
435
+ or not document .markup_text
436
+ ):
437
+ # ensure we have markup text
438
+ return []
439
+ start_in_markup = document .plain_to_markup .update (
462
440
citation .span ()[0 ], bisect_right
463
441
)
464
- for match in re .finditer (regex , markup_text [start_in_markup :]):
465
- full_start_in_plain = markup_to_plain .update (
442
+ for match in re .finditer (
443
+ regex , document .markup_text [start_in_markup :]
444
+ ):
445
+ full_start_in_plain = document .markup_to_plain .update (
466
446
start_in_markup + match .start (), bisect_left
467
447
)
468
- full_end_in_plain = markup_to_plain .update (
448
+ full_end_in_plain = document . markup_to_plain .update (
469
449
start_in_markup + match .end (), bisect_right
470
450
)
471
451
472
452
# the first group [match.group(0)] is the whole match,
473
453
# with whitespace and punctuation. the second group, match.group(1)
474
454
# is the only capturing and named group
475
- start_in_plain = markup_to_plain .update (
455
+ start_in_plain = document . markup_to_plain .update (
476
456
start_in_markup + match .start (1 ), bisect_left
477
457
)
478
- end_in_plain = markup_to_plain .update (
458
+ end_in_plain = document . markup_to_plain .update (
479
459
start_in_markup + match .end (1 ), bisect_right
480
460
)
481
-
482
461
reference = ReferenceCitation (
483
462
token = CaseReferenceToken (
484
- data = plain_text [start_in_plain :end_in_plain ],
463
+ data = document . plain_text [start_in_plain :end_in_plain ],
485
464
start = start_in_plain ,
486
465
end = end_in_plain ,
487
466
),
0 commit comments