ROB: Gracefully handle some text operators when the operands are missing

Closes #2975.
py-pdf · Dec 19, 2024 · 4804852
1 parent 17f6e35
commit 4804852
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 4 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -1980,12 +1980,21 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
                 memo_tm = tm_matrix.copy()
             # Table 5.2 page 398
             elif operator == b"Tz":
-                char_scale = float(operands[0]) / 100.0
+                if operands:
+                    char_scale = float(operands[0]) / 100.0
+                else:
+                    char_scale = 1.0
             elif operator == b"Tw":
-                space_scale = 1.0 + float(operands[0])
+                if operands:
+                    space_scale = 1.0 + float(operands[0])
+                else:
+                    space_scale = 1.0
             elif operator == b"TL":
-                scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[2]**2)
-                TL = float(operands[0]) * font_size * scale_x
+                if operands:
+                    scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[2]**2)
+                    TL = float(operands[0]) * font_size * scale_x
+                else:
+                    TL = 0.0
             elif operator == b"Tf":
                 if text != "":
                     output += text  # .translate(cmap)

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
@@ -272,3 +272,15 @@ def test_infinite_loop_arrays():
     page = reader.pages[0]
     extracted = page.extract_text()
     assert "RNA structure comparison" in extracted
+
+
+@pytest.mark.enable_socket  # the sample PDF is fetched by URL, so socket access must be allowed
+def test_tz_with_no_operands():
+    """Check that text is still extracted when `Tz` has no operands (#2975)."""
+    url = "https://github.com/user-attachments/files/17974120/9E5E080E-C8DB-4A6B-822B-9A67DC04E526-120438.pdf"
+    name = "iss2975.pdf"
+    data = get_data_from_url(url, name=name)
+
+    reader = PdfReader(BytesIO(data))
+    page = reader.pages[1]  # second page; presumably the one with the operand-less Tz - verify against #2975
+    assert "\nThankyouforyourattentiontothismatter.\n" in page.extract_text()