@@ -413,11 +413,12 @@ def find_case_name_in_html(
413
413
defendant , _ = convert_html_to_plain_text_and_loc (
414
414
document , defendant_tags
415
415
)
416
-
417
416
clean_plaintiff = strip_stop_words (plaintiff )
418
417
419
- citation .metadata .plaintiff = clean_plaintiff
420
- citation .metadata .defendant = strip_stop_words (defendant )
418
+ citation .metadata .plaintiff = clean_plaintiff .strip ().strip ("," )
419
+ citation .metadata .defendant = (
420
+ strip_stop_words (defendant ).strip ().strip ("," )
421
+ )
421
422
422
423
# Update full span start accordingly
423
424
if len (clean_plaintiff ) != len (plaintiff ):
@@ -431,9 +432,10 @@ def find_case_name_in_html(
431
432
# stopped at a stop word, work forward to possible title
432
433
# this should be at least two words (including whitespace)
433
434
# but with html could be more.
434
- shift = index + 2
435
+ # shift = index + 2
436
+ shift = 3
435
437
while True :
436
- if words [shift ] == " " :
438
+ if words [index + shift ] == " " :
437
439
shift += 1
438
440
else :
439
441
break
@@ -443,14 +445,16 @@ def find_case_name_in_html(
443
445
loc = word .start + right_offset - 1
444
446
# find a character in the word
445
447
filtered_tags = find_html_tags_at_position (document , loc )
446
-
447
448
if len (filtered_tags ) != 1 :
448
449
return None
449
450
450
451
defendant , start = convert_html_to_plain_text_and_loc (
451
452
document , filtered_tags
452
453
)
453
- citation .metadata .defendant = strip_stop_words (defendant )
454
+
455
+ citation .metadata .defendant = strip_stop_words (defendant ).strip (
456
+ ", "
457
+ )
454
458
citation .full_span_start = start
455
459
return
456
460
@@ -484,6 +488,7 @@ def convert_html_to_plain_text_and_loc(
484
488
Returns: The plain-text case name and the index in the plain text where it starts
485
489
"""
486
490
markup_location = results [0 ]
491
+
487
492
start = document .markup_to_plain .update ( # type: ignore
488
493
markup_location [1 ],
489
494
bisect_right ,
@@ -493,6 +498,7 @@ def convert_html_to_plain_text_and_loc(
493
498
bisect_right ,
494
499
)
495
500
case_name = document .plain_text [start :end ]
501
+ # print("------->", case_name, start, markup_location)
496
502
return (case_name , start )
497
503
498
504
0 commit comments