Skip to content

Commit

Permalink
ROB: Fix extract_text() issues on damaged PDFs (#2760)
Browse files Browse the repository at this point in the history
Closes #2702.
  • Loading branch information
pubpub-zz authored Aug 13, 2024
1 parent cf7fcfd commit 2eb565d
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def prepare_cm(ft: DictionaryObject) -> bytes:
cm: bytes
if isinstance(tu, StreamObject):
cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
elif isinstance(tu, str) and tu.startswith("/Identity"):
else: # if (tu is None) or cast(str, tu).startswith("/Identity"):
# the full range 0000-FFFF will be processed
cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
if isinstance(cm, str):
Expand Down
18 changes: 17 additions & 1 deletion tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from pypdf import PdfReader, PdfWriter
from pypdf._cmap import build_char_map
from pypdf.generic import ArrayObject, NameObject, NullObject
from pypdf.generic import ArrayObject, IndirectObject, NameObject, NullObject

from . import get_data_from_url

Expand Down Expand Up @@ -214,6 +214,22 @@ def test_eten_b5():
reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險")


@pytest.mark.enable_socket()
def test_missing_entries_in_cmap():
    """
    Regression test for issue #2702 (extract_text() on damaged PDFs).

    The PDF attached to the original report was discarded for this test
    because it is too slow/long to process. The same error is recreated
    from crazyones.pdf instead, by replacing the /ToUnicode entry of font
    /F1 on the first page with an indirect reference to object 99999999
    (presumably an object number that is not present in the file).
    """
    reader = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    first_page = reader.pages[0]

    # Build the reference first, then resolve the font dictionary, so the
    # order of operations matches a single chained assignment.
    dangling_ref = IndirectObject(99999999, 0, reader)
    font_f1 = first_page["/Resources"]["/Font"]["/F1"]
    font_f1[NameObject("/ToUnicode")] = dangling_ref

    # No assertion on the result: the test passes if extraction does not raise.
    first_page.extract_text()


def test_null_missing_width():
"""For coverage of 2792"""
writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
Expand Down

0 comments on commit 2eb565d

Please sign in to comment.