From 7c5fe87ef63b85823fc44c9f21cbb055d47be181 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Thu, 19 Dec 2024 19:29:18 +0100 Subject: [PATCH 1/3] ROB: Ignore odd-length strings when processing cmap lines (#3009) Closes #2216. --- pypdf/_cmap.py | 6 +++++- tests/test_cmap.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index e6c4bce88..de21b3429 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -1,3 +1,4 @@ +import binascii from binascii import unhexlify from math import ceil from typing import Any, Dict, List, Tuple, Union, cast @@ -304,7 +305,10 @@ def process_cm_line( elif b"endbfchar" in line: process_char = False elif process_rg: - multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg) + try: + multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg) + except binascii.Error as error: + logger_warning(f"Skipping broken line {line!r}: {error}", __name__) elif process_char: parse_bfchar(line, map_dict, int_entry) return process_rg, process_char, multiline_rg diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 55df3f1f2..e80842460 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -281,3 +281,14 @@ def test_iss2966(): reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert "Lorem ipsum dolor sit amet" in reader.pages[0].extract_text() + +@pytest.mark.enable_socket +def test_binascii_odd_length_string(caplog): + """Tests for #2216""" + url = "https://github.com/user-attachments/files/18199642/iss2216.pdf" + name = "iss2216.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + + page = reader.pages[0] + assert "\n(Many other theorems may\n" in page.extract_text() + assert "Skipping broken line b'143f 143f 10300': Odd-length string\n" in caplog.text From 83083bb15596af05fe9564aaf5e937a8f293e0db Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Thu, 19 Dec 2024 20:11:14 +0100 Subject: [PATCH 2/3] BUG: Handle chained colorspace for inline images when no filter is set (#3008) Closes #2998. --- pypdf/generic/_data_structures.py | 2 ++ tests/test_images.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 3313ab8da..f02fe4988 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -1343,6 +1343,8 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]: data = extract_inline_DCT(stream) elif filtr == "not set": cs = settings.get("/CS", "") + if isinstance(cs, list): + cs = cs[0] if "RGB" in cs: lcs = 3 elif "CMYK" in cs: diff --git a/tests/test_images.py b/tests/test_images.py index c0308eb3e..7d415f6f3 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -473,3 +473,14 @@ def test_4bits_images(caplog): name = "iss2411.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[0].images[1].image, img) == 1.0 + + +@pytest.mark.enable_socket +def test_no_filter_with_colorspace_as_list(): + """Tests for #2998""" + url = "https://github.com/user-attachments/files/18058571/9bf7a2e2-72c8-4ac1-b8ae-164df16c8cef.pdf" + name = "iss2998.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + + page = reader.pages[0] + page.images.items() From 4f2cd3439c6f074c515fe347ef92ba0bb44a9e37 Mon Sep 17 00:00:00 2001 From: Stefan <96178532+stefan6419846@users.noreply.github.com> Date: Thu, 19 Dec 2024 20:15:27 +0100 Subject: [PATCH 3/3] ROB: Fall back to non-Adobe Ascii85 format for missing end markers (#3007) Closes #2996. --- pypdf/filters.py | 8 +++++++- tests/test_filters.py | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pypdf/filters.py b/pypdf/filters.py index 517d6aac3..a95b96a54 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -446,7 +446,13 @@ def decode( if isinstance(data, str): data = data.encode() data = data.strip(WHITESPACES_AS_BYTES) - return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES) + try: + return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES) + except ValueError as error: + if error.args[0] == "Ascii85 encoded byte sequences must end with b'~>'": + logger_warning("Ignoring missing Ascii85 end marker.", __name__) + return a85decode(data, adobe=False, ignorechars=WHITESPACES_AS_BYTES) + raise class DCTDecode: diff --git a/tests/test_filters.py b/tests/test_filters.py index 23b90cca8..90a119844 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -594,3 +594,29 @@ def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup(): name = "issue2331.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) reader.pages[0].images[0] + + +@pytest.mark.enable_socket +def test_ascii85decode__invalid_end__recoverable(caplog): + """From #2996""" + url = "https://github.com/user-attachments/files/18050808/1af7d56a-5c8c-4914-85b3-b2536a5525cd.pdf" + name = "issue2996.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + + page = reader.pages[1] + assert page.extract_text() == "" + assert "Ignoring missing Ascii85 end marker." in caplog.text + + +def test_ascii85decode__non_recoverable(caplog): + # Without our custom handling, this would complain about the final `~>` being missing. + data = "äöüß" + with pytest.raises(ValueError, match="Non-Ascii85 digit found: Ã"): + ASCII85Decode.decode(data) + assert "Ignoring missing Ascii85 end marker." in caplog.text + caplog.clear() + + data += "~>" + with pytest.raises(ValueError, match="Non-Ascii85 digit found: Ã"): + ASCII85Decode.decode(data) + assert caplog.text == ""