Skip to content

Commit 762fc1f

Browse files
authored
ROB: Continue parsing dictionary object when error is detected (#2872)
Closes #2866.
1 parent e959073 commit 762fc1f

File tree

2 files changed

+63
-2
lines changed

2 files changed

+63
-2
lines changed

pypdf/generic/_data_structures.py

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@
5757
logger_warning,
5858
read_non_whitespace,
5959
read_until_regex,
60+
read_until_whitespace,
6061
skip_over_comment,
6162
)
6263
from ..constants import (
@@ -567,7 +568,17 @@ def read_unsized_from_stream(
567568
break
568569
stream.seek(-1, 1)
569570
try:
570-
key = read_object(stream, pdf)
571+
try:
572+
key = read_object(stream, pdf)
573+
if not isinstance(key, NameObject):
574+
raise PdfReadError(
575+
f"Expecting a NameObject for key but found {key!r}"
576+
)
577+
except PdfReadError as exc:
578+
if pdf is not None and pdf.strict:
579+
raise
580+
logger_warning(exc.__repr__(), __name__)
581+
continue
571582
tok = read_non_whitespace(stream)
572583
stream.seek(-1, 1)
573584
value = read_object(stream, pdf, forced_encoding)
@@ -1443,9 +1454,13 @@ def read_object(
14431454
else:
14441455
return NumberObject.read_from_stream(stream)
14451456
else:
1457+
pos = stream.tell()
14461458
stream.seek(-20, 1)
1459+
stream_extract = stream.read(80)
1460+
stream.seek(pos)
1461+
read_until_whitespace(stream)
14471462
raise PdfReadError(
1448-
f"Invalid Elementary Object starting with {tok!r} @{stream.tell()}: {stream.read(80).__repr__()}"
1463+
f"Invalid Elementary Object starting with {tok!r} @{pos}: {stream_extract!r}"
14491464
)
14501465

14511466

tests/test_reader.py

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1657,3 +1657,49 @@ def test_comments_in_array(caplog):
16571657
reader.stream = BytesIO(b[:1149])
16581658
with pytest.raises(PdfStreamError):
16591659
reader.pages[0]
1660+
1661+
1662+
@pytest.mark.enable_socket()
1663+
def test_space_in_names_to_continue_processing(caplog):
1664+
"""
1665+
This deals with space not encoded in names inducing errors.
1666+
Also covers case where NameObject not met for key.
1667+
"""
1668+
url = "https://github.com/user-attachments/files/17095516/crash-e108c4f677040b61e12fa9f1cfde025d704c9b0d.pdf"
1669+
name = "iss2866.pdf" # reused
1670+
b = get_data_from_url(url, name=name)
1671+
reader = PdfReader(BytesIO(b))
1672+
obj = reader.get_object(70)
1673+
assert all(
1674+
x in obj
1675+
for x in (
1676+
"/BaseFont",
1677+
"/DescendantFonts",
1678+
"/Encoding",
1679+
"/Subtype",
1680+
"/ToUnicode",
1681+
"/Type",
1682+
)
1683+
)
1684+
assert obj["/BaseFont"] == "/AASGAA+Arial,Unicode" # MS is missing to meet spec
1685+
assert 'PdfReadError("Invalid Elementary Object starting with' in caplog.text
1686+
1687+
caplog.clear()
1688+
1689+
b = b[:264] + b"(Inv) /d " + b[273:]
1690+
reader = PdfReader(BytesIO(b))
1691+
obj = reader.get_object(70)
1692+
assert all(
1693+
x in obj
1694+
for x in ["/DescendantFonts", "/Encoding", "/Subtype", "/ToUnicode", "/Type"]
1695+
)
1696+
assert all(
1697+
x in caplog.text
1698+
for x in (
1699+
"Expecting a NameObject for key but",
1700+
'PdfReadError("Invalid Elementary Object starting with',
1701+
)
1702+
)
1703+
reader = PdfReader(BytesIO(b), strict=True)
1704+
with pytest.raises(PdfReadError):
1705+
obj = reader.get_object(70)

0 commit comments

Comments
 (0)