Skip to content

Commit

Permalink
Remove invalid hyperlink annotations to satisfy Ghostscript 10.x duri…
Browse files Browse the repository at this point in the history
…ng PDF/A conversion

Closes #1425
  • Loading branch information
jbarlow83 committed Nov 17, 2024
1 parent 08f95c0 commit a659f83
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 1 deletion.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,6 @@ convention = "google"

[tool.ruff.format]
quote-style = "preserve"

[dependency-groups]
dev = ["mypy>=1.13.0"]
66 changes: 66 additions & 0 deletions src/ocrmypdf/_annots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""OCRmyPDF PDF annotation cleanup."""

from __future__ import annotations

import logging

from pikepdf import Dictionary, Name, NameTree, Pdf

log = logging.getLogger(__name__)


def remove_broken_goto_annotations(pdf: Pdf) -> bool:
    """Remove broken goto annotations from a PDF.

    If a PDF contains a GoTo Action that points to a named destination that does not
    exist, Ghostscript PDF/A conversion will fail. In any event, a named destination
    that is not defined is not useful.

    Only destinations given *by name* (a PDF string or name object) are checked
    against the document's ``/Names /Dests`` name tree. Explicit destinations
    (arrays such as ``[page /XYZ left top zoom]``) do not refer to the name tree
    and are left untouched, as are actions whose subtype is present and is not
    ``/GoTo`` (for example ``/GoToR``, whose destination lives in another file).

    Args:
        pdf: Opened PDF file.

    Returns:
        bool: True if the file was modified, False if not.
    """
    # Imported here so this function does not depend on edits to the
    # module-level import list.
    from pikepdf import String

    # Check if there are any named destinations
    if Name.Names not in pdf.Root:
        return False
    if Name.Dests not in pdf.Root[Name.Names]:
        return False

    dests = pdf.Root[Name.Names][Name.Dests]
    if not isinstance(dests, Dictionary):
        return False

    # Create a set of all named destinations
    names = set(NameTree(dests).keys())

    modified = False
    for n, page in enumerate(pdf.pages):
        if Name.Annots not in page:
            continue
        for annot in page[Name.Annots]:
            if not isinstance(annot, Dictionary):
                continue
            if Name.A not in annot or Name.D not in annot[Name.A]:
                continue
            action = annot[Name.A]

            # /D has a different meaning in other action types (remote or
            # embedded go-to, thread, ...), so only GoTo actions are validated
            # against this document's names. A missing /S is tolerated and
            # treated as GoTo.
            if Name.S in action and action[Name.S] != Name.GoTo:
                continue

            dest = action[Name.D]
            # An explicit destination (array) is self-contained and always
            # resolvable; only by-name references can dangle.
            if not isinstance(dest, (Name, String)):
                continue

            # We found an annotation that points to a named destination
            named_destination = str(dest)
            if named_destination not in names:
                # If there is no corresponding named destination, remove the
                # annotation. Having no destination set is still valid and just
                # makes the link non-functional.
                log.warning(
                    f"Disabling a hyperlink annotation on page {n + 1} to a "
                    "non-existent named destination "
                    f"{named_destination}."
                )
                del action[Name.D]
                modified = True

    return modified
1 change: 1 addition & 0 deletions src/ocrmypdf/_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pikepdf import __version__ as PIKEPDF_VERSION
from pikepdf.models.metadata import PdfMetadata, encode_pdf_date

from ocrmypdf._annots import remove_broken_goto_annotations
from ocrmypdf._defaults import PROGRAM_NAME
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._version import __version__ as OCRMYPF_VERSION
Expand Down
11 changes: 10 additions & 1 deletion src/ocrmypdf/_pipelines/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
from typing import NamedTuple, cast

import PIL
from pikepdf import Pdf

from ocrmypdf._annots import remove_broken_goto_annotations
from ocrmypdf._concurrent import Executor, setup_executor
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._logging import PageNumberFilter
Expand Down Expand Up @@ -438,7 +440,14 @@ def postprocess(
pdf_file: Path, context: PdfContext, executor: Executor
) -> tuple[Path, Sequence[str]]:
"""Postprocess the PDF file."""
pdf_out = pdf_file
# pdf_out = pdf_file
with Pdf.open(pdf_file) as pdf:
fix_annots = context.get_path('fix_annots.pdf')
if remove_broken_goto_annotations(pdf):
pdf.save(fix_annots)
pdf_out = fix_annots
else:
pdf_out = pdf_file
if context.options.output_type.startswith('pdfa'):
ps_stub_out = generate_postscript_stub(context)
pdf_out = convert_to_pdfa(pdf_out, ps_stub_out, context)
Expand Down
31 changes: 31 additions & 0 deletions tests/test_annots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

from __future__ import annotations

import pytest
from pikepdf import Array, Dictionary, Name, NameTree, Pdf

from ocrmypdf._annots import remove_broken_goto_annotations


def test_remove_broken_goto_annotations(resources):
    """Check that only links to undefined named destinations lose their /D."""
    with Pdf.open(resources / 'link.pdf') as pdf:

        def link_action(page_index):
            # Re-fetch on every use so each check reads the document's
            # current state.
            return pdf.pages[page_index].Annots[0].A

        # The fixture as shipped is expected to need no changes.
        changed_initially = remove_broken_goto_annotations(pdf)
        assert not changed_initially, "File should not be modified"

        # Construct Dests nametree and attach it under /Root /Names
        dest_tree = NameTree.new(pdf)
        names_dict = pdf.make_indirect(Dictionary())
        pdf.Root[Name.Names] = names_dict
        names_dict[Name.Dests] = dest_tree.obj

        # One entry with a bogus (empty dictionary) target, one that really
        # points at the first page.
        dest_tree['Invalid'] = pdf.make_indirect(Dictionary())
        dest_tree['Valid'] = Array([pdf.pages[0].obj, Name.XYZ, 0, 0, 0])

        # Page 1's link names a destination that is absent from the tree;
        # page 2's link names one that is present.
        link_action(0).D = 'Missing'
        link_action(1).D = 'Valid'

        changed_after_edit = remove_broken_goto_annotations(pdf)
        assert changed_after_edit, "File should be modified"

        assert Name.D not in link_action(0)
        assert Name.D in link_action(1)
2 changes: 2 additions & 0 deletions tests/test_page_reducing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
import hypothesis
import pytest

0 comments on commit a659f83

Please sign in to comment.