diff --git a/tagged-pdf-generation/pom.xml b/tagged-pdf-generation/pom.xml
new file mode 100644
index 0000000..a73f3b2
--- /dev/null
+++ b/tagged-pdf-generation/pom.xml
@@ -0,0 +1,68 @@
+
+
+
+
+ verapdf-tools
+ org.verapdf
+ 1.0-SNAPSHOT
+
+ 4.0.0
+ org.verapdf
+
+ tagged-pdf-generation
+ 1.0-SNAPSHOT
+
+
+ UTF-8
+ 1.8
+ 1.8
+
+
+
+
+ org.apache.pdfbox
+ pdfbox
+ 3.0.2
+
+
+ junit
+ junit
+ 4.11
+ test
+
+
+
+
+
+
+ maven-compiler-plugin
+
+
+ org.apache.maven.plugins
+ maven-assembly-plugin
+
+
+
+ org.verapdf.tools.TaggedPDFGenerator
+
+
+
+ jar-with-dependencies
+
+ false
+
+
+
+ make-assembly
+ package
+
+ single
+
+
+
+
+
+
+
+
diff --git a/tagged-pdf-generation/src/main/java/org/verapdf/tools/PDStructureTreeRootAccess.java b/tagged-pdf-generation/src/main/java/org/verapdf/tools/PDStructureTreeRootAccess.java
new file mode 100644
--- /dev/null
+++ b/tagged-pdf-generation/src/main/java/org/verapdf/tools/PDStructureTreeRootAccess.java
@@ -0,0 +1,28 @@
+package org.verapdf.tools;
+
+import org.apache.pdfbox.cos.COSInteger;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
+import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
+
+/**
+ * Structure tree root that also accepts {@link PDMarkedContent} as a direct kid.
+ */
+public class PDStructureTreeRootAccess extends PDStructureTreeRoot {
+
+    /** Creates a new structure tree root via the superclass default constructor. */
+    public PDStructureTreeRootAccess() {
+        super();
+    }
+
+    /**
+     * Appends the MCID of the given marked content to this root's kids.
+     *
+     * @param markedContent the marked content to reference; {@code null} is ignored
+     */
+    public void appendKid(PDMarkedContent markedContent) {
+        if (markedContent == null) {
+            return;
+        }
+        this.appendKid(COSInteger.get(markedContent.getMCID()));
+    }
+}
diff --git a/tagged-pdf-generation/src/main/java/org/verapdf/tools/StructureType.java b/tagged-pdf-generation/src/main/java/org/verapdf/tools/StructureType.java
new file mode 100644
--- /dev/null
+++ b/tagged-pdf-generation/src/main/java/org/verapdf/tools/StructureType.java
@@ -0,0 +1,82 @@
+package org.verapdf.tools;
+
+/**
+ * Structure type names that the generator places into the structure tree of the PDFs it builds.
+ *
+ * <p>Each constant carries the exact name written to the PDF, available via {@link #string()}.
+ */
+public enum StructureType {
+    DOCUMENT("Document"),
+    PART("Part"),
+    DIV("Div"),
+    CAPTION("Caption"),
+    THEAD("THead"),
+    TBODY("TBody"),
+    TFOOT("TFoot"),
+    H("H"),
+    P("P"),
+    L("L"),
+    LI("LI"),
+    LBL("Lbl"),
+    LBODY("LBody"),
+    TABLE("Table"),
+    TR("TR"),
+    TH("TH"),
+    TD("TD"),
+    SPAN("Span"),
+    LINK("Link"),
+    ANNOT("Annot"),
+    RUBY("Ruby"),
+    WARICHU("Warichu"),
+    FIGURE("Figure"),
+    FORMULA("Formula"),
+    FORM("Form"),
+    RB("RB"),
+    RT("RT"),
+    RP("RP"),
+    WT("WT"),
+    WP("WP"),
+    ART("Art"),
+    SECT("Sect"),
+    BLOCK_QUOTE("BlockQuote"),
+    TOC("TOC"),
+    TOCI("TOCI"),
+    INDEX("Index"),
+    NON_STRUCT("NonStruct"),
+    PRIVATE("Private"),
+    QUOTE("Quote"),
+    NOTE("Note"),
+    REFERENCE("Reference"),
+    BIB_ENTRY("BibEntry"),
+    CODE("Code"),
+    H1("H1"),
+    H2("H2"),
+    H3("H3"),
+    H4("H4"),
+    H5("H5"),
+    H6("H6"),
+    DOCUMENT_FRAGMENT("DocumentFragment"),
+    ASIDE("Aside"),
+    TITLE("Title"),
+    FENOTE("FENote"),
+    SUB("Sub"),
+    EM("Em"),
+    STRONG("Strong"),
+    ARTIFACT("Artifact");
+
+    /** Structure type name as written to the PDF. */
+    private final String text;
+
+    StructureType(String text) {
+        this.text = text;
+    }
+
+    /**
+     * Returns the structure type name of this constant.
+     *
+     * @return the name, e.g. {@code "Document"} for {@link #DOCUMENT}
+     */
+    public String string() {
+        return text;
+    }
+}
diff --git a/tagged-pdf-generation/src/main/java/org/verapdf/tools/TaggedPDFGenerator.java b/tagged-pdf-generation/src/main/java/org/verapdf/tools/TaggedPDFGenerator.java
new file mode 100644
--- /dev/null
+++ b/tagged-pdf-generation/src/main/java/org/verapdf/tools/TaggedPDFGenerator.java
@@ -0,0 +1,290 @@
+package org.verapdf.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
+import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
+import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
+import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDPropertyList;
+import org.apache.pdfbox.pdmodel.font.PDType1Font;
+import org.apache.pdfbox.pdmodel.font.Standard14Fonts.FontName;
+
+/**
+ * Command-line tool that generates tagged PDF test documents with PDFBox.
+ *
+ * <p>One uncompressed PDF is written per entry of the file-name table: an "all inclusions"
+ * document that nests every non-transitional structure type under every other one, and one
+ * document per transitional type (Div, NonStruct, Part) that places that type between each
+ * parent element and its children.
+ *
+ * <p>Usage: {@code TaggedPDFGenerator [outputFolder]}. If the argument is absent or does not
+ * name an existing directory, output goes to {@code generated_files} under {@code user.dir}.
+ */
+public class TaggedPDFGenerator {
+    private static final Logger LOGGER = Logger.getLogger(TaggedPDFGenerator.class.getName());
+
+    /** Key of the {@link #FILES} entry that selects the all-inclusions document. */
+    private static final String ALL_INCLUSIONS = "all_inclusions";
+    /** Directory created under {@code user.dir} when no usable output folder is given. */
+    private static final String DEFAULT_OUTPUT_DIR = "generated_files";
+    /** Number of same-typed children appended under each parent element. */
+    private static final int CHILDREN_PER_TYPE = 2;
+
+    /** Every structure type name declared by {@link StructureType}, in sorted order. */
+    private static final Set<String> TYPES;
+    /** {@link #TYPES} without the transitional types (Div, NonStruct, Part). */
+    private static final Set<String> TYPES_EXCLUDE_TRANSIT;
+    /** File base name per document kind (a transitional type or {@link #ALL_INCLUSIONS}). */
+    private static final Map<String, String> FILES;
+
+    static {
+        Set<String> transitional = new TreeSet<>();
+        transitional.add(StructureType.DIV.string());
+        transitional.add(StructureType.NON_STRUCT.string());
+        transitional.add(StructureType.PART.string());
+
+        Set<String> all = new TreeSet<>();
+        for (StructureType type : StructureType.values()) {
+            all.add(type.string());
+        }
+
+        Set<String> nonTransitional = new TreeSet<>(all);
+        nonTransitional.removeAll(transitional);
+
+        Map<String, String> files = new HashMap<>();
+        files.put(StructureType.DIV.string(), "transitionaltag_div_test");
+        files.put(StructureType.NON_STRUCT.string(), "transitionaltag_nonstruct_test");
+        files.put(StructureType.PART.string(), "transitionaltag_part_test");
+        files.put(ALL_INCLUSIONS, "all_inclusions_test");
+
+        TYPES = Collections.unmodifiableSet(all);
+        TYPES_EXCLUDE_TRANSIT = Collections.unmodifiableSet(nonTransitional);
+        FILES = Collections.unmodifiableMap(files);
+    }
+
+    /** MCID given to the next marked-content sequence; restarted at 1 for every document. */
+    private int currentMCID = 1;
+    /** Output folder requested on the command line, or {@code null} if none was given. */
+    private String folder;
+
+    /**
+     * Entry point. Generates all documents and logs (rather than propagates) any failure.
+     *
+     * @param args optional single argument: the folder to write the PDFs into
+     */
+    public static void main(String[] args) {
+        TaggedPDFGenerator taggedPDFGenerator = new TaggedPDFGenerator();
+
+        try {
+            taggedPDFGenerator.run(args);
+        } catch (Exception ex) {
+            // Pass the exception to the logger so the stack trace goes through the same handlers.
+            LOGGER.log(Level.SEVERE,
+                    "Error during pdf generation: " + ex.getMessage() + ", process stopped.", ex);
+        }
+    }
+
+    /**
+     * Resolves the directory the PDFs are written to.
+     *
+     * @return the requested folder when it is an existing directory, otherwise the (created if
+     *         necessary) {@code generated_files} directory under the current working directory
+     * @throws IOException if the fallback directory cannot be created
+     */
+    private Path getWorkingDir() throws IOException {
+        if (folder != null) {
+            Path requested = Paths.get(folder);
+            if (Files.isDirectory(requested)) {
+                return requested;
+            }
+            LOGGER.warning("Output folder '" + folder + "' is not an existing directory,"
+                    + " using the default location instead.");
+        }
+
+        // Build the path from components so the platform's separator is used; a hard-coded
+        // backslash would become part of the directory name on non-Windows systems.
+        Path fallback = Paths.get(System.getProperty("user.dir"), DEFAULT_OUTPUT_DIR);
+        Files.createDirectories(fallback);
+
+        return fallback.toAbsolutePath();
+    }
+
+    /**
+     * Generates every document listed in {@link #FILES} and saves it without compression.
+     *
+     * @param args command-line arguments; the first one, if present, is the output folder
+     * @throws IOException if a document cannot be built or written
+     */
+    private void run(String[] args) throws IOException {
+        if (args.length > 0) {
+            folder = args[0];
+        }
+
+        Path parentFolder = getWorkingDir();
+
+        for (Map.Entry<String, String> entry : FILES.entrySet()) {
+            String type = entry.getKey();
+            File file = parentFolder.resolve(entry.getValue() + ".pdf").toFile();
+            currentMCID = 1;
+
+            // try-with-resources closes the document even when building or saving fails.
+            try (PDDocument document = new PDDocument()) {
+                if (ALL_INCLUSIONS.equals(type)) {
+                    allInclusionsPDF(document);
+                } else {
+                    transitionalPDF(document, type);
+                }
+                document.save(file, CompressParameters.NO_COMPRESSION);
+            }
+        }
+    }
+
+    /**
+     * Fills {@code document} with one page whose structure tree nests every non-transitional
+     * type under every non-transitional type.
+     *
+     * <p>For each type two elements are added to the tree root: one holding two marked-content
+     * kids plus {@link #CHILDREN_PER_TYPE} children of every type, and one left empty. The
+     * root itself also receives two marked-content kids.
+     *
+     * @param document the empty document to populate
+     * @throws IOException if writing the page content fails
+     */
+    private void allInclusionsPDF(PDDocument document) throws IOException {
+        PDStructureTreeRootAccess treeRoot = new PDStructureTreeRootAccess();
+        PDPage page = addTaggedPage(document, treeRoot);
+
+        try (PDPageContentStream content =
+                new PDPageContentStream(document, page, AppendMode.OVERWRITE, false)) {
+            treeRoot.appendKid(textContent(content, " "));
+            treeRoot.appendKid(textContent(content, " "));
+
+            for (String type : TYPES_EXCLUDE_TRANSIT) {
+                PDStructureElement element = new PDStructureElement(type, treeRoot);
+                element.appendKid(textContent(content, " "));
+                element.appendKid(textContent(content, " "));
+                appendChildren(element, TYPES_EXCLUDE_TRANSIT, page);
+                element.setPage(page);
+                treeRoot.appendKid(element);
+
+                PDStructureElement emptyElement = new PDStructureElement(type, treeRoot);
+                emptyElement.setPage(page);
+                treeRoot.appendKid(emptyElement);
+            }
+        }
+    }
+
+    /**
+     * Fills {@code document} with one page whose structure tree wraps the children of every
+     * type in an element of {@code transitionalType}.
+     *
+     * <p>Each top-level element receives two transitional kids: one holding two marked-content
+     * kids plus {@link #CHILDREN_PER_TYPE} children of every type, and one left empty.
+     *
+     * @param document the empty document to populate
+     * @param transitionalType structure type name placed between parent and children
+     * @throws IOException if writing the page content fails
+     */
+    private void transitionalPDF(PDDocument document, String transitionalType) throws IOException {
+        PDStructureTreeRoot treeRoot = new PDStructureTreeRoot();
+        PDPage page = addTaggedPage(document, treeRoot);
+
+        try (PDPageContentStream content =
+                new PDPageContentStream(document, page, AppendMode.OVERWRITE, false)) {
+            for (String type : TYPES) {
+                PDStructureElement element = new PDStructureElement(type, treeRoot);
+                PDStructureElement transitionalElement =
+                        new PDStructureElement(transitionalType, element);
+
+                transitionalElement.appendKid(textContent(content, " "));
+                transitionalElement.appendKid(textContent(content, " "));
+                appendChildren(transitionalElement, TYPES, page);
+
+                element.setPage(page);
+                element.appendKid(transitionalElement);
+                element.appendKid(new PDStructureElement(transitionalType, element));
+                treeRoot.appendKid(element);
+            }
+        }
+    }
+
+    /**
+     * Adds a new blank page to {@code document} and installs {@code treeRoot} in its catalog.
+     *
+     * @param document the document to extend
+     * @param treeRoot the structure tree root to register
+     * @return the page that was added
+     */
+    private static PDPage addTaggedPage(PDDocument document, PDStructureTreeRoot treeRoot) {
+        PDPage page = new PDPage();
+        document.addPage(page);
+
+        PDDocumentCatalog catalog = document.getDocumentCatalog();
+        catalog.setStructureTreeRoot(treeRoot);
+
+        return page;
+    }
+
+    /**
+     * Appends {@link #CHILDREN_PER_TYPE} empty child elements of every given type to
+     * {@code parent}, each associated with {@code page}.
+     *
+     * @param parent the element that receives the children
+     * @param childTypes structure type names of the children to create
+     * @param page the page set on every child
+     */
+    private static void appendChildren(PDStructureElement parent, Set<String> childTypes,
+            PDPage page) {
+        for (String childType : childTypes) {
+            for (int index = 0; index < CHILDREN_PER_TYPE; index++) {
+                PDStructureElement child = new PDStructureElement(childType, parent);
+                child.setPage(page);
+                parent.appendKid(child);
+            }
+        }
+    }
+
+    /**
+     * Writes {@code text} to the page as a marked-content sequence tagged {@code P} and
+     * returns a matching {@link PDMarkedContent} carrying the MCID that was used.
+     *
+     * @param content the open content stream of the page
+     * @param text the text to show
+     * @return marked content whose properties hold the assigned MCID
+     * @throws IOException if writing to the content stream fails
+     */
+    private PDMarkedContent textContent(PDPageContentStream content, String text)
+            throws IOException {
+        COSDictionary dictionary = new COSDictionary();
+        dictionary.setInt(COSName.MCID, currentMCID);
+        currentMCID++;
+
+        content.beginText();
+        content.setFont(new PDType1Font(FontName.HELVETICA_BOLD), 14);
+        content.beginMarkedContent(COSName.P, PDPropertyList.create(dictionary));
+        content.showText(text);
+        content.endMarkedContent();
+        content.endText();
+
+        return new PDMarkedContent(COSName.P, dictionary);
+    }
+}