ROB: Fix extract_text() issues on damaged PDFs (#2760)

Closes #2702.
py-pdf · Aug 13, 2024 · 2eb565d · 2eb565d
1 parent cf7fcfd
commit 2eb565d
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 2 deletions.
diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -258,7 +258,7 @@ def prepare_cm(ft: DictionaryObject) -> bytes:
     cm: bytes
     if isinstance(tu, StreamObject):
         cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
-    elif isinstance(tu, str) and tu.startswith("/Identity"):
+    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
         # the full range 0000-FFFF will be processed
         cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
     if isinstance(cm, str):

diff --git a/tests/test_cmap.py b/tests/test_cmap.py
@@ -6,7 +6,7 @@
 
 from pypdf import PdfReader, PdfWriter
 from pypdf._cmap import build_char_map
-from pypdf.generic import ArrayObject, NameObject, NullObject
+from pypdf.generic import ArrayObject, IndirectObject, NameObject, NullObject
 
 from . import get_data_from_url
 
@@ -214,6 +214,22 @@ def test_eten_b5():
     reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險")
 
 
+@pytest.mark.enable_socket()
+def test_missing_entries_in_cmap():
+    """
+    Issue #2702: this error is observed on damaged PDFs.
+    The original file is not used here because processing it takes too long;
+    instead, the same error is reproduced using crazyones.pdf.
+    """
+    pdf_path = RESOURCE_ROOT / "crazyones.pdf"
+    reader = PdfReader(pdf_path)
+    p = reader.pages[0]
+    p["/Resources"]["/Font"]["/F1"][NameObject("/ToUnicode")] = IndirectObject(
+        99999999, 0, reader
+    )
+    p.extract_text()
+
+
 def test_null_missing_width():
     """For coverage of 2792"""
     writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")