Skip to content

Commit

Permalink
Merge branch 'main' into issue2975
Browse files: browse the repository at this point in the history
  • Loading branch information
pubpub-zz authored Dec 19, 2024
2 parents 9763d9a + 4f2cd34 commit 358f1e3
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 2 deletions.
6 changes: 5 additions & 1 deletion pypdf/_cmap.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import binascii
from binascii import unhexlify
from math import ceil
from typing import Any, Dict, List, Tuple, Union, cast
Expand Down Expand Up @@ -304,7 +305,10 @@ def process_cm_line(
elif b"endbfchar" in line:
process_char = False
elif process_rg:
multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
try:
multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
except binascii.Error as error:
logger_warning(f"Skipping broken line {line!r}: {error}", __name__)
elif process_char:
parse_bfchar(line, map_dict, int_entry)
return process_rg, process_char, multiline_rg
Expand Down
8 changes: 7 additions & 1 deletion pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,7 +446,13 @@ def decode(
if isinstance(data, str):
data = data.encode()
data = data.strip(WHITESPACES_AS_BYTES)
return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
try:
return a85decode(data, adobe=True, ignorechars=WHITESPACES_AS_BYTES)
except ValueError as error:
if error.args[0] == "Ascii85 encoded byte sequences must end with b'~>'":
logger_warning("Ignoring missing Ascii85 end marker.", __name__)
return a85decode(data, adobe=False, ignorechars=WHITESPACES_AS_BYTES)
raise


class DCTDecode:
Expand Down
2 changes: 2 additions & 0 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -1343,6 +1343,8 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
data = extract_inline_DCT(stream)
elif filtr == "not set":
cs = settings.get("/CS", "")
if isinstance(cs, list):
cs = cs[0]
if "RGB" in cs:
lcs = 3
elif "CMYK" in cs:
Expand Down
11 changes: 11 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,3 +281,14 @@ def test_iss2966():
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert "Lorem ipsum dolor sit amet" in reader.pages[0].extract_text()


@pytest.mark.enable_socket
def test_binascii_odd_length_string(caplog):
    """Text extraction survives an odd-length hex string and logs a warning (#2216)."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18199642/iss2216.pdf",
        name="iss2216.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]

    # Extraction must still yield the surrounding text ...
    extracted = first_page.extract_text()
    assert "\n(Many other theorems may\n" in extracted
    # ... while the offending line is reported in the log instead of raising.
    assert "Skipping broken line b'143f 143f 10300': Odd-length string\n" in caplog.text
26 changes: 26 additions & 0 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,3 +594,29 @@ def test_flate_decode_with_image_mode_1__whitespace_at_end_of_lookup():
name = "issue2331.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
reader.pages[0].images[0]


@pytest.mark.enable_socket
def test_ascii85decode__invalid_end__recoverable(caplog):
    """Text extraction succeeds and warns when the Ascii85 end marker is missing (#2996)."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18050808/1af7d56a-5c8c-4914-85b3-b2536a5525cd.pdf",
        name="issue2996.pdf",
    )
    second_page = PdfReader(BytesIO(pdf_bytes)).pages[1]

    extracted = second_page.extract_text()
    assert extracted == ""
    # The missing end marker is reported through the log rather than an exception.
    assert "Ignoring missing Ascii85 end marker." in caplog.text


def test_ascii85decode__non_recoverable(caplog):
    """Invalid Ascii85 digits raise ``ValueError`` with or without the ``~>`` end marker."""
    digit_error = "Non-Ascii85 digit found: Ã"
    payload = "äöüß"

    # Without the custom handling, this would complain about the final `~>`
    # being missing; here the warning is logged and the digit error surfaces.
    with pytest.raises(ValueError, match=digit_error):
        ASCII85Decode.decode(payload)
    assert "Ignoring missing Ascii85 end marker." in caplog.text
    caplog.clear()

    # With the end marker present, the same digit error is raised and nothing is logged.
    payload = payload + "~>"
    with pytest.raises(ValueError, match=digit_error):
        ASCII85Decode.decode(payload)
    assert caplog.text == ""
11 changes: 11 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,3 +473,14 @@ def test_4bits_images(caplog):
name = "iss2411.png"
img = Image.open(BytesIO(get_data_from_url(url, name=name)))
assert image_similarity(reader.pages[0].images[1].image, img) == 1.0


@pytest.mark.enable_socket
def test_no_filter_with_colorspace_as_list():
    """Enumerating the images of the first page must not raise (#2998)."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18058571/9bf7a2e2-72c8-4ac1-b8ae-164df16c8cef.pdf",
        name="iss2998.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]

    # No assertion on the result: completing the call without an exception is the check.
    first_page.images.items()

0 comments on commit 358f1e3

Please sign in to comment.