From 13832342639bf32a380137555d761e0deb723c14 Mon Sep 17 00:00:00 2001 From: nsw42 Date: Fri, 29 Nov 2024 13:32:50 +0000 Subject: [PATCH] BUG: Handle indirect objects in font width calculations (#2967) Closes #2966. --------- Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- pypdf/_cmap.py | 14 ++++++++------ pypdf/_page.py | 4 ++-- tests/test_cmap.py | 12 ++++++++++++ 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 54c54436e..e6c4bce88 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -396,7 +396,7 @@ def build_font_width_map( st: int = 0 en: int = 0 try: - default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object)] * 2.0 + default_font_width = _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object())] * 2.0 except KeyError: pass if "/DescendantFonts" in ft: # ft["/Subtype"].startswith("/CIDFontType"): @@ -404,9 +404,9 @@ def build_font_width_map( # Widths for a CIDFont are defined using the DW and W entries. # DW2 and W2 are for vertical use. Vertical type is not implemented. ft1 = ft["/DescendantFonts"][0].get_object() # type: ignore - try: - font_width_map["default"] = cast(float, ft1["/DW"]) - except Exception: + if "/DW" in ft1: + font_width_map["default"] = cast(float, ft1["/DW"].get_object()) + else: font_width_map["default"] = default_font_width if "/W" in ft1: w = ft1["/W"].get_object() @@ -418,13 +418,15 @@ def build_font_width_map( if isinstance(second, int): # C_first C_last same_W en = second + width = w[2].get_object() for c_code in range(st, en + 1): - font_width_map[chr(c_code)] = w[2] + font_width_map[chr(c_code)] = width w = w[3:] elif isinstance(second, list): # Starting_C [W1 W2 ... Wn] c_code = st - for width in second: + for ww in second: + width = ww.get_object() font_width_map[chr(c_code)] = width c_code += 1 w = w[2:] diff --git a/pypdf/_page.py b/pypdf/_page.py index d5de76f77..cbae35817 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1750,7 +1750,7 @@ def _debug_for_extract(self) -> str: # pragma: no cover out += "No Font\n" return out - def _get_acutual_font_widths( + def _get_actual_font_widths( self, cmap: Tuple[ Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject] @@ -1817,7 +1817,7 @@ def _handle_tj( rtl_dir, visitor_text) font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = ( - self._get_acutual_font_widths(cmap, text_operands, font_size, space_width)) + self._get_actual_font_widths(cmap, text_operands, font_size, space_width)) actual_str_size["str_widths"] += font_widths return text, rtl_dir, actual_str_size diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 07a778520..55df3f1f2 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -269,3 +269,15 @@ def test_iss2925(): name = "iss2925.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) assert "slicing on the PDG to extract the relevant contextual" in reader.pages[3].extract_text() + + +@pytest.mark.enable_socket +def test_iss2966(): + """Regression test for issue #2966: indirect objects in fonts""" + url = ( + "https://github.com/user-attachments/files/17904233/repro_out.pdf" + ) + name = "iss2966.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert "Lorem ipsum dolor sit amet" in reader.pages[0].extract_text() +