-
-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Remove invalid hyperlink annotations to satisfy Ghostscript 10.x during PDF/A conversion
Closes #1425
- Loading branch information
Showing
6 changed files
with
113 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -155,3 +155,6 @@ convention = "google" | |
|
||
[tool.ruff.format] | ||
quote-style = "preserve" | ||
|
||
[dependency-groups] | ||
dev = ["mypy>=1.13.0"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# SPDX-FileCopyrightText: 2024 James R. Barlow | ||
# SPDX-License-Identifier: MPL-2.0 | ||
|
||
"""OCRmyPDF PDF annotation cleanup.""" | ||
|
||
from __future__ import annotations | ||
|
||
import logging | ||
|
||
from pikepdf import Dictionary, Name, NameTree, Pdf | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
def remove_broken_goto_annotations(pdf: Pdf) -> bool:
    """Remove broken goto annotations from a PDF.

    If a PDF contains a GoTo Action that points to a named destination that does not
    exist, Ghostscript PDF/A conversion will fail. In any event, a named destination
    that is not defined is not useful.

    Only destinations referenced *by name* are examined. An action's ``/D`` entry
    may also be an explicit destination (an array such as ``[page /XYZ x y z]``);
    those are not named destinations and are left untouched. Destinations given
    as a string are looked up in the ``/Names /Dests`` name tree; destinations
    given as a name object are looked up in the catalog's legacy ``/Dests``
    dictionary.

    Args:
        pdf: Opened PDF file.

    Returns:
        bool: True if the file was modified, False if not.
    """
    # Local import: String is not among the names this module imports at the top.
    from pikepdf import String

    modified = False

    # Check if there are any named destinations
    if Name.Names not in pdf.Root:
        return modified
    if Name.Dests not in pdf.Root[Name.Names]:
        return modified

    dests = pdf.Root[Name.Names][Name.Dests]
    if not isinstance(dests, Dictionary):
        return modified
    nametree = NameTree(dests)

    # All destinations defined in the name tree (keyed by string).
    names = set(nametree.keys())

    # PDF 1.1 style destinations: a plain dictionary in the catalog keyed by
    # name objects. Name-typed /D entries refer to this dictionary rather than
    # to the name tree.
    legacy_dests = pdf.Root[Name.Dests] if Name.Dests in pdf.Root else None
    if not isinstance(legacy_dests, Dictionary):
        legacy_dests = None

    for n, page in enumerate(pdf.pages):
        if Name.Annots not in page:
            continue
        for annot in page[Name.Annots]:
            if not isinstance(annot, Dictionary):
                continue
            if Name.A not in annot:
                continue
            action = annot[Name.A]
            if not isinstance(action, Dictionary) or Name.D not in action:
                continue
            # NOTE(review): /D also appears in remote/embedded go-to actions,
            # where it refers to another document - confirm whether those
            # should be exempt from this check.
            target = action[Name.D]

            if isinstance(target, String):
                exists = str(target) in names
            elif isinstance(target, Name):
                exists = legacy_dests is not None and target in legacy_dests
            else:
                # Explicit destination (array) or something unexpected: this
                # is not a reference to a named destination, so leave it be.
                continue
            if exists:
                continue

            # If there is no corresponding named destination, remove the
            # destination from the action. Having no destination set is still
            # valid and just makes the link non-functional.
            named_destination = str(target)
            log.warning(
                f"Disabling a hyperlink annotation on page {n + 1} to a "
                "non-existent named destination "
                f"{named_destination}."
            )
            del action[Name.D]
            modified = True

    return modified
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# SPDX-FileCopyrightText: 2024 James R. Barlow | ||
# SPDX-License-Identifier: MPL-2.0 | ||
|
||
from __future__ import annotations | ||
|
||
import pytest | ||
from pikepdf import Array, Dictionary, Name, NameTree, Pdf | ||
|
||
from ocrmypdf._annots import remove_broken_goto_annotations | ||
|
||
|
||
def test_remove_broken_goto_annotations(resources):
    """Links to undefined named destinations are disabled; valid ones are kept."""
    with Pdf.open(resources / 'link.pdf') as pdf:
        # Before a Dests name tree is installed the function must be a no-op.
        changed = remove_broken_goto_annotations(pdf)
        assert not changed, "File should not be modified"

        # Install a fresh Dests name tree under the document catalog.
        dest_tree = NameTree.new(pdf)
        names_dict = pdf.make_indirect(Dictionary())
        pdf.Root[Name.Names] = names_dict
        names_dict[Name.Dests] = dest_tree.obj

        # One entry whose value is not a usable destination ...
        dest_tree['Invalid'] = pdf.make_indirect(Dictionary())
        # ... and one entry that really points at the first page.
        dest_tree['Valid'] = Array([pdf.pages[0].obj, Name.XYZ, 0, 0, 0])

        # Page 1 links to a name that is absent from the tree; page 2 links
        # to the valid entry.
        first_page = pdf.pages[0]
        second_page = pdf.pages[1]
        first_page.Annots[0].A.D = 'Missing'
        second_page.Annots[0].A.D = 'Valid'

        changed = remove_broken_goto_annotations(pdf)
        assert changed, "File should be modified"

        # Only the dangling destination is stripped.
        assert Name.D not in first_page.Annots[0].A
        assert Name.D in second_page.Annots[0].A
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
import hypothesis | ||
import pytest |