Skip to content

Commit

Permalink
BUG: Handle indirect objects in font width calculations (#2967)
Browse files Browse the repository at this point in the history
Closes #2966.

---------

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
  • Loading branch information
nsw42 and stefan6419846 authored Nov 29, 2024
1 parent db460c0 commit 1383234
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 8 deletions.
14 changes: 8 additions & 6 deletions pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,17 +396,17 @@ def build_font_width_map(
st: int = 0
en: int = 0
try:
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object)] * 2.0
default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object())] * 2.0
except KeyError:
pass
if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"):
# §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
# Widths for a CIDFont are defined using the DW and W entries.
# DW2 and W2 are for vertical use. Vertical type is not implemented.
ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore
try:
font_width_map["default"] = cast(float, ft1["/DW"])
except Exception:
if "/DW" in ft1:
font_width_map["default"] = cast(float, ft1["/DW"].get_object())
else:
font_width_map["default"] = default_font_width
if "/W" in ft1:
w = ft1["/W"].get_object()
Expand All @@ -418,13 +418,15 @@ def build_font_width_map(
if isinstance(second, int):
# C_first C_last same_W
en = second
width = w[2].get_object()
for c_code in range(st, en + 1):
font_width_map[chr(c_code)] = w[2]
font_width_map[chr(c_code)] = width
w = w[3:]
elif isinstance(second, list):
# Starting_C [W1 W2 ... Wn]
c_code = st
for width in second:
for ww in second:
width = ww.get_object()
font_width_map[chr(c_code)] = width
c_code += 1
w = w[2:]
Expand Down
4 changes: 2 additions & 2 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1750,7 +1750,7 @@ def _debug_for_extract(self) -> str: # pragma: no cover
out += "No Font\n"
return out

def _get_acutual_font_widths(
def _get_actual_font_widths(
self,
cmap: Tuple[
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
Expand Down Expand Up @@ -1817,7 +1817,7 @@ def _handle_tj(
rtl_dir,
visitor_text)
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
self._get_acutual_font_widths(cmap, text_operands, font_size, space_width))
self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
actual_str_size["str_widths"] += font_widths

return text, rtl_dir, actual_str_size
Expand Down
12 changes: 12 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,3 +269,15 @@ def test_iss2925():
name = "iss2925.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert "slicing on the PDG to extract the relevant contextual" in reader.pages[3].extract_text()


@pytest.mark.enable_socket
def test_iss2966():
    """Regression test for issue #2966: fonts that use indirect objects.

    Fetches the reproduction PDF attached to the issue and checks that
    the expected text is present in the text extracted from its first page.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/17904233/repro_out.pdf",
        name="iss2966.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]
    # Only the presence of the sample text is asserted, not the exact output.
    assert "Lorem ipsum dolor sit amet" in first_page.extract_text()

0 comments on commit 1383234

Please sign in to comment.