Skip to content

Commit

Permalink
BUG: Avoid extracting inline images twice and dropping other operators (
Browse files Browse the repository at this point in the history
  • Loading branch information
stefan6419846 authored Dec 18, 2024
1 parent 434d09d commit ec982d2
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -765,7 +765,7 @@ def _get_inline_images(self) -> Dict[str, ImageFile]:
)
elif ope in (b"BI", b"EI", b"ID"): # pragma: no cover
raise PdfReadError(
f"{ope!r} operator met whereas not expected,"
f"{ope!r} operator met whereas not expected, "
"please share usecase with pypdf dev team"
)
"""backup
Expand Down
2 changes: 2 additions & 0 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -1361,6 +1361,7 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
data = stream.read(
ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"])
)
# Move to the `EI` if possible.
ei = read_non_whitespace(stream)
stream.seek(-1, 1)
else:
Expand All @@ -1369,6 +1370,7 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
ei = stream.read(3)
stream.seek(-1, 1)
if ei[0:2] != b"EI" or ei[2:3] not in WHITESPACES:
# Deal with wrong/missing `EI` tags.
stream.seek(savpos, 0)
data = extract_inline_default(stream)
return {"settings": settings, "data": data}
Expand Down
3 changes: 2 additions & 1 deletion pypdf/generic/_image_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,12 @@ def extract_inline_default(stream: StreamType) -> bytes:
if data_buffered[pos_ei - 1 : pos_ei] not in WHITESPACES and tok3 not in {
b"Q",
b"E",
}: # for Q ou EMC
}: # for Q or EMC
stream.seek(saved_pos, 0)
continue
# Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficient
# remove E(I) wrongly inserted earlier
stream.seek(saved_pos - 1, 0)
stream_out.truncate(sav_pos_ei)
break

Expand Down
37 changes: 37 additions & 0 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2542,3 +2542,40 @@ def test_deprecate_with_as():
with pytest.warns(DeprecationWarning) as w:
writer.with_as_usage = val # old code allowed setting this, so...
assert "with_as_usage is deprecated" in w[0].message.args[0]


@pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript")
@pytest.mark.enable_socket
def test_inline_image_q_operator_handling(tmp_path):
    """
    Regression test for #2927.

    Downloads the sample PDF and a reference PNG, runs the PDF through
    ``PdfWriter`` (transferring each page's rotation into its content),
    renders the written file with Ghostscript and checks that the rendering
    matches the reference image.
    """
    # Input document.
    source_pdf = BytesIO(
        get_data_from_url(
            "https://github.com/user-attachments/files/17614880/test_clean.pdf",
            name="iss2927.pdf",
        )
    )

    # Reference rendering, stored on disk for the image comparison.
    reference_png = tmp_path / "expected.png"
    reference_png.write_bytes(
        get_data_from_url(
            "https://github.com/user-attachments/assets/abe16f48-9afa-4179-b1e8-62be27b95c26",
            name="iss2927.png",
        )
    )

    writer = PdfWriter()
    writer.append(source_pdf)
    for page in writer.pages:
        page.transfer_rotation_to_content()

    rewritten_pdf = tmp_path / "out.pdf"
    rendered_png = tmp_path / "actual.png"
    writer.write(rewritten_pdf)

    # Render the rewritten PDF to a PNG with Ghostscript.
    gs_command = [
        GHOSTSCRIPT_BINARY,
        "-r120",
        "-sDEVICE=pngalpha",
        "-o",
        rendered_png,
        rewritten_pdf,
    ]
    # False positive: https://github.com/PyCQA/bandit/issues/333
    subprocess.run(gs_command)  # noqa: S603

    assert rendered_png.is_file()
    similarity = image_similarity(rendered_png, reference_png)
    assert similarity >= 0.999999

0 comments on commit ec982d2

Please sign in to comment.