Skip to content

Commit

Permalink
ROB: Gracefully handle some text operators when the operands are missing
Browse files: browse the repository at this point in the history
Closes #2975.
  • Loading branch information
stefan6419846 committed Dec 19, 2024
1 parent 17f6e35 commit 4804852
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 4 deletions.
17 changes: 13 additions & 4 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1980,12 +1980,21 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
memo_tm = tm_matrix.copy()
# Table 5.2 page 398
elif operator == b"Tz":
char_scale = float(operands[0]) / 100.0
if operands:
char_scale = float(operands[0]) / 100.0
else:
char_scale = 1.0
elif operator == b"Tw":
space_scale = 1.0 + float(operands[0])
if operands:
space_scale = 1.0 + float(operands[0])
else:
space_scale = 1.0
elif operator == b"TL":
scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[2]**2)
TL = float(operands[0]) * font_size * scale_x
if operands:
scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[2]**2)
TL = float(operands[0]) * font_size * scale_x
else:
TL = 0.0
elif operator == b"Tf":
if text != "":
output += text # .translate(cmap)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,3 +272,15 @@ def test_infinite_loop_arrays():
page = reader.pages[0]
extracted = page.extract_text()
assert "RNA structure comparison" in extracted


@pytest.mark.enable_socket
def test_tz_with_no_operands():
    """
    Regression test for #2975.

    Downloads the sample document attached to the issue and checks that
    text extraction on its second page yields the expected sentence.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/17974120/9E5E080E-C8DB-4A6B-822B-9A67DC04E526-120438.pdf",
        name="iss2975.pdf",
    )

    # The assertion targets the second page of the document (index 1).
    second_page = PdfReader(BytesIO(pdf_bytes)).pages[1]
    extracted = second_page.extract_text()
    assert "\nThankyouforyourattentiontothismatter.\n" in extracted

0 comments on commit 4804852

Please sign in to comment.