From a659f83d67bf9488d19864636aa35fb15ee6180e Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Sat, 16 Nov 2024 19:02:10 -0800
Subject: [PATCH] Remove invalid hyperlink annotations to satisfy Ghostscript
 10.x during PDF/A conversion

Only GoTo destinations given by name are checked against the /Dests name
tree; explicit (array) destinations are left untouched.

Closes #1425
---
 pyproject.toml                     |  3 ++
 src/ocrmypdf/_annots.py            | 76 ++++++++++++++++++++++++++++++
 src/ocrmypdf/_metadata.py          |  1 +
 src/ocrmypdf/_pipelines/_common.py |  8 ++++
 tests/test_annots.py               | 48 +++++++++++++++++++
 tests/test_page_reducing.py        |  2 +
 6 files changed, 138 insertions(+)
 create mode 100644 src/ocrmypdf/_annots.py
 create mode 100644 tests/test_annots.py
 create mode 100644 tests/test_page_reducing.py

diff --git a/pyproject.toml b/pyproject.toml
index feb6bbc70..4892fcf79 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -155,3 +155,6 @@ convention = "google"
 
 [tool.ruff.format]
 quote-style = "preserve"
+
+[dependency-groups]
+dev = ["mypy>=1.13.0"]
diff --git a/src/ocrmypdf/_annots.py b/src/ocrmypdf/_annots.py
new file mode 100644
--- /dev/null
+++ b/src/ocrmypdf/_annots.py
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: 2024 James R. Barlow
+# SPDX-License-Identifier: MPL-2.0
+
+"""OCRmyPDF PDF annotation cleanup."""
+
+from __future__ import annotations
+
+import logging
+
+from pikepdf import Array, Dictionary, Name, NameTree, Pdf
+
+log = logging.getLogger(__name__)
+
+
+def remove_broken_goto_annotations(pdf: Pdf) -> bool:
+    """Disable link annotations that point to missing named destinations.
+
+    If a PDF contains a GoTo Action that points to a named destination that does not
+    exist, Ghostscript PDF/A conversion will fail. In any event, a named destination
+    that is not defined is not useful.
+
+    Only the action's ``/D`` entry is deleted; the annotation itself is kept, which
+    is still valid and merely leaves the link non-functional. Explicit destinations
+    (arrays) are not names and are never touched. Nothing is changed when the
+    document has no ``/Names`` ``/Dests`` name tree.
+
+    Args:
+        pdf: Opened PDF file.
+
+    Returns:
+        bool: True if the file was modified, False if not.
+    """
+    modified = False
+
+    # Check if there are any named destinations
+    if Name.Names not in pdf.Root:
+        return modified
+    if Name.Dests not in pdf.Root[Name.Names]:
+        return modified
+
+    dests = pdf.Root[Name.Names][Name.Dests]
+    if not isinstance(dests, Dictionary):
+        return modified
+    nametree = NameTree(dests)
+
+    # Create a set of all named destinations
+    names = set(nametree.keys())
+
+    for n, page in enumerate(pdf.pages):
+        if Name.Annots not in page:
+            continue
+        for annot in page[Name.Annots]:
+            if not isinstance(annot, Dictionary):
+                continue
+            if Name.A not in annot or Name.D not in annot[Name.A]:
+                continue
+            destination = annot[Name.A][Name.D]
+            if isinstance(destination, Array):
+                # An explicit destination ([page /XYZ ...]) is not looked up in
+                # the name tree, so it cannot be "missing".
+                continue
+            # We found an annotation that points to a named destination
+            named_destination = str(destination)
+            if named_destination not in names:
+                # If there is no corresponding named destination, drop the
+                # action's destination. Having no destination set is still
+                # valid and just makes the link non-functional.
+                log.warning(
+                    f"Disabling a hyperlink annotation on page {n + 1} to a "
+                    "non-existent named destination "
+                    f"{named_destination}."
+                )
+                del annot[Name.A][Name.D]
+                modified = True
+
+    return modified
diff --git a/src/ocrmypdf/_metadata.py b/src/ocrmypdf/_metadata.py
index 2896968f5..5b9e03e46 100644
--- a/src/ocrmypdf/_metadata.py
+++ b/src/ocrmypdf/_metadata.py
@@ -15,6 +15,7 @@ from pikepdf import __version__ as PIKEPDF_VERSION
 
 from pikepdf.models.metadata import PdfMetadata, encode_pdf_date
 
+from ocrmypdf._annots import remove_broken_goto_annotations
 from ocrmypdf._defaults import PROGRAM_NAME
 from ocrmypdf._jobcontext import PdfContext
 from ocrmypdf._version import __version__ as OCRMYPF_VERSION
diff --git a/src/ocrmypdf/_pipelines/_common.py b/src/ocrmypdf/_pipelines/_common.py
--- a/src/ocrmypdf/_pipelines/_common.py
+++ b/src/ocrmypdf/_pipelines/_common.py
@@ -20,7 +20,9 @@ from typing import NamedTuple, cast
 
 
 import PIL
+from pikepdf import Pdf
 
+from ocrmypdf._annots import remove_broken_goto_annotations
 from ocrmypdf._concurrent import Executor, setup_executor
 from ocrmypdf._jobcontext import PageContext, PdfContext
 from ocrmypdf._logging import PageNumberFilter
@@ -439,6 +441,12 @@ def postprocess(
 ) -> tuple[Path, Sequence[str]]:
     """Postprocess the PDF file."""
     pdf_out = pdf_file
+    # Ghostscript 10.x fails PDF/A conversion on links to undefined named
+    # destinations, so disable them first; only write a new file if needed.
+    with Pdf.open(pdf_file) as pdf:
+        if remove_broken_goto_annotations(pdf):
+            pdf_out = context.get_path('fix_annots.pdf')
+            pdf.save(pdf_out)
     if context.options.output_type.startswith('pdfa'):
         ps_stub_out = generate_postscript_stub(context)
         pdf_out = convert_to_pdfa(pdf_out, ps_stub_out, context)
diff --git a/tests/test_annots.py b/tests/test_annots.py
new file mode 100644
--- /dev/null
+++ b/tests/test_annots.py
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: 2024 James R. Barlow
+# SPDX-License-Identifier: MPL-2.0
+
+from __future__ import annotations
+
+from pikepdf import Array, Dictionary, Name, NameTree, Pdf
+
+from ocrmypdf._annots import remove_broken_goto_annotations
+
+
+def test_remove_broken_goto_annotations(resources):
+    with Pdf.open(resources / 'link.pdf') as pdf:
+        assert not remove_broken_goto_annotations(pdf), "File should not be modified"
+
+        # Construct Dests nametree
+        nt = NameTree.new(pdf)
+        names = pdf.Root[Name.Names] = pdf.make_indirect(Dictionary())
+        names[Name.Dests] = nt.obj
+        # Create a broken named destination
+        nt['Invalid'] = pdf.make_indirect(Dictionary())
+        # Create a valid named destination
+        nt['Valid'] = Array([pdf.pages[0].obj, Name.XYZ, 0, 0, 0])
+
+        pdf.pages[0].Annots[0].A.D = 'Missing'
+        pdf.pages[1].Annots[0].A.D = 'Valid'
+
+        assert remove_broken_goto_annotations(pdf), "File should be modified"
+
+        assert Name.D not in pdf.pages[0].Annots[0].A
+        assert Name.D in pdf.pages[1].Annots[0].A
+
+
+def test_explicit_destinations_are_kept():
+    with Pdf.new() as pdf:
+        page = pdf.add_blank_page()
+        nt = NameTree.new(pdf)
+        pdf.Root[Name.Names] = pdf.make_indirect(Dictionary(Dests=nt.obj))
+        nt['Valid'] = Array([page.obj, Name.XYZ, 0, 0, 0])
+
+        # A GoTo action may carry the destination itself instead of a name
+        action = Dictionary(S=Name.GoTo, D=Array([page.obj, Name.Fit]))
+        link = Dictionary(
+            Type=Name.Annot, Subtype=Name.Link, Rect=Array([0, 0, 10, 10]), A=action
+        )
+        page.obj[Name.Annots] = Array([link])
+
+        assert not remove_broken_goto_annotations(pdf), "File should not be modified"
+        assert Name.D in page.obj[Name.Annots][0][Name.A]
diff --git a/tests/test_page_reducing.py b/tests/test_page_reducing.py
new file mode 100644
index 000000000..8e0b4af40
--- /dev/null
+++ b/tests/test_page_reducing.py
@@ -0,0 +1,2 @@
+import hypothesis
+import pytest