Skip to content

Commit

Permalink
Use PDFBox 3 as default text converter; fallback to 1.8 in case of errors
Browse files Browse the repository at this point in the history

Issue: #4449
  • Loading branch information
buchen committed Jan 3, 2025
1 parent ebcc63e commit 05ea668
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ public void execute(@Named(IServiceConstants.ACTIVE_PART) MPart part,

StringBuilder textBuilder = new StringBuilder();
textBuilder.append("```").append("\n");
textBuilder.append("PDFBox Version: ")
.append(inputFile.getPDFBoxVersion().toString()) //
textBuilder.append("PDFBox Version: ").append(inputFile.getPDFBoxVersion()) //
.append("\n");
textBuilder.append("Portfolio Performance Version: ")
.append(PortfolioPlugin.getDefault().getBundle().getVersion().toString()) //
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.eclipse.core.runtime.IProgressMonitor;

import name.abuchen.portfolio.Messages;
import name.abuchen.portfolio.PortfolioLog;
import name.abuchen.portfolio.datatransfer.Extractor;
import name.abuchen.portfolio.datatransfer.Extractor.Item;
import name.abuchen.portfolio.datatransfer.SecurityCache;
Expand Down Expand Up @@ -164,6 +165,27 @@ public Map<Extractor, List<Item>> run(IProgressMonitor monitor, Map<File, List<E
}
}

// Fallback pass: nothing has been extracted so far, so re-convert the PDF
// with the legacy converter and give every extractor a second attempt.
if (!extracted)
{
// Replaces the input file's text (and its reported PDFBox version) with
// the output of the legacy conversion before the extractors run again.
inputFile.convertLegacyPDFtoText();
for (Extractor extractor : extractors)
{
List<Item> items = extractor.extract(securityCache, inputFile, warnings);

// The first extractor that yields any item wins; its items are
// collected under that extractor and the remaining ones are skipped.
if (!items.isEmpty())
{
extracted = true;
itemsByExtractor.computeIfAbsent(extractor, e -> new ArrayList<Item>()).addAll(items);
break;
}
}

// Record that the import only succeeded through the legacy conversion.
if (extracted)
{
PortfolioLog.info("PDF successfully imported with legacy method " + inputFile.getName());
}
}

if (!extracted)
{
Predicate<? super Exception> isNotUnsupportedOperation = e -> !(e instanceof UnsupportedOperationException);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,14 @@
import java.util.List;
import java.util.Scanner;

import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.osgi.framework.FrameworkUtil;
import org.osgi.framework.Version;

import name.abuchen.portfolio.datatransfer.Extractor;
import name.abuchen.portfolio.pdfbox1.PDFBox1;
import name.abuchen.portfolio.pdfbox3.PDFBox3;

public class PDFInputFile extends Extractor.InputFile
{
private String text;
private String version;

public PDFInputFile(File file)
{
Expand Down Expand Up @@ -62,32 +59,21 @@ public String getText()
return text;
}

public Version getPDFBoxVersion()
public String getPDFBoxVersion()
{
return FrameworkUtil.getBundle(PDDocument.class).getVersion();
return version;
}

public void convertPDFtoText() throws IOException
{
try (PDDocument document = PDDocument.load(getFile()))
{
boolean isProtected = document.isEncrypted();
if (isProtected)
{
document.decrypt(""); //$NON-NLS-1$
document.setAllSecurityToBeRemoved(true);
}

PDFTextStripper textStripper = new PDFTextStripper();
textStripper.setSortByPosition(true);
text = textStripper.getText(document);
text = new PDFBox3().convertToText(getFile());
version = new PDFBox3().getPDFBoxVersion();
}

text = withoutHorizontalWhitespace(text);
}
catch (CryptographyException e)
{
throw new IOException(e);
}
/**
 * Converts this PDF file to text with the legacy {@link PDFBox1} converter.
 * <p>
 * The converted text is stored in {@code text}, and the PDFBox version
 * reported by {@link PDFBox1} is stored in {@code version}, so that
 * {@code getPDFBoxVersion()} reflects the converter that produced the text.
 *
 * @throws IOException
 *             declared by this method; presumably raised when the legacy
 *             conversion fails (TODO confirm against PDFBox1)
 */
public void convertLegacyPDFtoText() throws IOException
{
final PDFBox1 legacyConverter = new PDFBox1();
this.text = legacyConverter.convertToText(getFile());

// A separate instance is queried for the version, as in the original code.
final PDFBox1 versionProvider = new PDFBox1();
this.version = versionProvider.getPDFBoxVersion();
}

private String withoutHorizontalWhitespace(String s)
Expand Down

0 comments on commit 05ea668

Please sign in to comment.