ROB: Continue parsing dictionary object when error is detected (#2872)

pubpub-zz · web-flow · commit 762fc1f6cd1e · 2024-09-27T10:40:23.000+02:00
Closes #2866.
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -57,6 +57,7 @@
     logger_warning,
     read_non_whitespace,
     read_until_regex,
+    read_until_whitespace,
     skip_over_comment,
 )
 from ..constants import (
@@ -567,7 +568,17 @@ def read_unsized_from_stream(
                 break
             stream.seek(-1, 1)
             try:
-                key = read_object(stream, pdf)
+                try:
+                    key = read_object(stream, pdf)
+                    if not isinstance(key, NameObject):
+                        raise PdfReadError(
+                            f"Expecting a NameObject for key but found {key!r}"
+                        )
+                except PdfReadError as exc:
+                    if pdf is not None and pdf.strict:
+                        raise
+                    logger_warning(exc.__repr__(), __name__)
+                    continue
                 tok = read_non_whitespace(stream)
                 stream.seek(-1, 1)
                 value = read_object(stream, pdf, forced_encoding)
@@ -1443,9 +1454,13 @@ def read_object(
         else:
             return NumberObject.read_from_stream(stream)
     else:
+        pos = stream.tell()
         stream.seek(-20, 1)
+        stream_extract = stream.read(80)
+        stream.seek(pos)
+        read_until_whitespace(stream)
         raise PdfReadError(
-            f"Invalid Elementary Object starting with {tok!r} @{stream.tell()}: {stream.read(80).__repr__()}"
+            f"Invalid Elementary Object starting with {tok!r} @{pos}: {stream_extract!r}"
         )
 
 
diff --git a/tests/test_reader.py b/tests/test_reader.py
@@ -1657,3 +1657,49 @@ def test_comments_in_array(caplog):
     reader.stream = BytesIO(b[:1149])
     with pytest.raises(PdfStreamError):
         reader.pages[0]
+
+
+@pytest.mark.enable_socket()
+def test_space_in_names_to_continue_processing(caplog):
+    """
+    Check that parsing continues when a name contains an unencoded space.
+    Also covers the case where a dictionary key is not a NameObject.
+    """
+    url = "https://github.com/user-attachments/files/17095516/crash-e108c4f677040b61e12fa9f1cfde025d704c9b0d.pdf"
+    name = "iss2866.pdf"  # reused
+    b = get_data_from_url(url, name=name)
+    reader = PdfReader(BytesIO(b))
+    obj = reader.get_object(70)
+    assert all(
+        x in obj
+        for x in (
+            "/BaseFont",
+            "/DescendantFonts",
+            "/Encoding",
+            "/Subtype",
+            "/ToUnicode",
+            "/Type",
+        )
+    )
+    assert obj["/BaseFont"] == "/AASGAA+Arial,Unicode"  # name ends at the unencoded space, so "MS" is missing
+    assert 'PdfReadError("Invalid Elementary Object starting with' in caplog.text
+
+    caplog.clear()
+
+    b = b[:264] + b"(Inv) /d " + b[273:]
+    reader = PdfReader(BytesIO(b))
+    obj = reader.get_object(70)
+    assert all(
+        x in obj
+        for x in ["/DescendantFonts", "/Encoding", "/Subtype", "/ToUnicode", "/Type"]
+    )
+    assert all(
+        x in caplog.text
+        for x in (
+            "Expecting a NameObject for key but",
+            'PdfReadError("Invalid Elementary Object starting with',
+        )
+    )
+    reader = PdfReader(BytesIO(b), strict=True)
+    with pytest.raises(PdfReadError):
+        obj = reader.get_object(70)