BUG: Avoid extracting inline images twice and dropping other operators (#3002)
py-pdf · Dec 18, 2024 · ec982d2 · ec982d2
1 parent 434d09d
commit ec982d2
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 2 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -765,7 +765,7 @@ def _get_inline_images(self) -> Dict[str, ImageFile]:
                 )
             elif ope in (b"BI", b"EI", b"ID"):  # pragma: no cover
                 raise PdfReadError(
-                    f"{ope!r} operator met whereas not expected,"
+                    f"{ope!r} operator met whereas not expected, "
                     "please share usecase with pypdf dev team"
                 )
             """backup

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -1361,6 +1361,7 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
                 data = stream.read(
                     ceil(cast(int, settings["/W"]) * lcs) * cast(int, settings["/H"])
                 )
+            # Move to the `EI` if possible.
             ei = read_non_whitespace(stream)
             stream.seek(-1, 1)
         else:
@@ -1369,6 +1370,7 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
         ei = stream.read(3)
         stream.seek(-1, 1)
         if ei[0:2] != b"EI" or ei[2:3] not in WHITESPACES:
+            # Deal with wrong/missing `EI` tags.
             stream.seek(savpos, 0)
             data = extract_inline_default(stream)
         return {"settings": settings, "data": data}

diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py
@@ -224,11 +224,12 @@ def extract_inline_default(stream: StreamType) -> bytes:
             if data_buffered[pos_ei - 1 : pos_ei] not in WHITESPACES and tok3 not in {
                 b"Q",
                 b"E",
-            }:  # for Q ou EMC
+            }:  # for Q or EMC
                 stream.seek(saved_pos, 0)
                 continue
             # Data contains [\s]EI[\s](Q|EMC): 4 chars are sufficients
             # remove E(I) wrongly inserted earlier
+            stream.seek(saved_pos - 1, 0)
             stream_out.truncate(sav_pos_ei)
             break
 

diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -2542,3 +2542,40 @@ def test_deprecate_with_as():
         with pytest.warns(DeprecationWarning) as w:
             writer.with_as_usage = val  # old code allowed setting this, so...
         assert "with_as_usage is deprecated" in w[0].message.args[0]
+
+
+@pytest.mark.skipif(GHOSTSCRIPT_BINARY is None, reason="Requires Ghostscript")
+@pytest.mark.enable_socket
+def test_inline_image_q_operator_handling(tmp_path):
+    """Test for #2927"""
+    pdf_url = "https://github.com/user-attachments/files/17614880/test_clean.pdf"
+    pdf_name = "iss2927.pdf"
+    pdf_data = BytesIO(get_data_from_url(pdf_url, name=pdf_name))
+
+    png_url = "https://github.com/user-attachments/assets/abe16f48-9afa-4179-b1e8-62be27b95c26"
+    png_name = "iss2927.png"
+    expected_png_path = tmp_path / "expected.png"
+    expected_png_path.write_bytes(get_data_from_url(png_url, name=png_name))
+
+    writer = PdfWriter()
+    writer.append(pdf_data)
+    for page in writer.pages:
+        page.transfer_rotation_to_content()
+
+    pdf_path = tmp_path / "out.pdf"
+    png_path = tmp_path / "actual.png"
+
+    writer.write(pdf_path)
+    # False positive: https://github.com/PyCQA/bandit/issues/333
+    subprocess.run(  # noqa: S603
+        [
+            GHOSTSCRIPT_BINARY,
+            "-r120",
+            "-sDEVICE=pngalpha",
+            "-o",
+            png_path,
+            pdf_path,
+        ]
+    )
+    assert png_path.is_file()
+    assert image_similarity(png_path, expected_png_path) >= 0.999999