Skip to content

Commit

Permalink
Regression tests generator for ISO 32005 (Tagged PDF)
Browse files Browse the repository at this point in the history
  • Loading branch information
ProxyNexus authored and MaximPlusov committed Apr 17, 2024
1 parent 7c4a6da commit 6680fd6
Show file tree
Hide file tree
Showing 4 changed files with 354 additions and 0 deletions.
68 changes: 68 additions & 0 deletions tagged-pdf-generation/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
<?xml version="1.0" encoding="UTF-8"?>

<!--
  Maven build for the tagged-pdf-generation module.
  Compiles for Java 1.8 and, during the package phase, assembles a single
  jar-with-dependencies whose manifest main class is
  org.verapdf.tools.TaggedPDFGenerator.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Parent module: org.verapdf:verapdf-tools -->
<parent>
<artifactId>verapdf-tools</artifactId>
<groupId>org.verapdf</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<!-- NOTE(review): groupId and version repeat the parent's values; presumably they could be inherited instead - confirm -->
<groupId>org.verapdf</groupId>

<artifactId>tagged-pdf-generation</artifactId>
<version>1.0-SNAPSHOT</version>

<!-- Source encoding and Java language level used by the compiler plugin -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>

<dependencies>
<!-- Apache PDFBox: the PDF object model used by the generator classes -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.2</version>
</dependency>
<!-- JUnit, available on the test classpath only -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
</dependencies>

<build>
<plugins>
<!-- Compiler plugin declared without version or configuration here -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
</plugin>
<!-- Assembly plugin: packages the module and its dependencies into one runnable jar -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<!-- Entry point written to the jar manifest -->
<mainClass>org.verapdf.tools.TaggedPDFGenerator</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<!-- Do not append the assembly id to the produced artifact's file name -->
<appendAssemblyId>false</appendAssemblyId>
</configuration>
<executions>
<execution>
<id>make-assembly</id> <!-- this is used for inheritance merges -->
<phase>package</phase> <!-- bind to the packaging phase -->
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package org.verapdf.tools;

import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;

/**
 * A {@link PDStructureTreeRoot} subclass that additionally accepts
 * {@link PDMarkedContent} as a direct kid of the structure tree root.
 */
public class PDStructureTreeRootAccess extends PDStructureTreeRoot {

    /**
     * Creates the root by delegating to the superclass default constructor.
     */
    public PDStructureTreeRootAccess() {
        super();
    }

    /**
     * Appends the MCID of the given marked content to this root as an integer kid.
     *
     * @param markedContent the marked content whose MCID is appended;
     *                      a {@code null} argument is ignored
     */
    public void appendKid(PDMarkedContent markedContent) {
        if (markedContent != null) {
            COSInteger mcidKid = COSInteger.get(markedContent.getMCID());
            this.appendKid(mcidKid);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package org.verapdf.tools;

/**
 * Structure type names. Each constant carries the exact name string
 * returned by {@link #string()}.
 */
public enum StructureType {
    DOCUMENT("Document"),
    PART("Part"),
    DIV("Div"),
    CAPTION("Caption"),
    THEAD("THead"),
    TBODY("TBody"),
    TFOOT("TFoot"),
    H("H"),
    P("P"),
    L("L"),
    LI("LI"),
    LBL("Lbl"),
    LBODY("LBody"),
    TABLE("Table"),
    TR("TR"),
    TH("TH"),
    TD("TD"),
    SPAN("Span"),
    LINK("Link"),
    ANNOT("Annot"),
    RUBY("Ruby"),
    WARICHU("Warichu"),
    FIGURE("Figure"),
    FORMULA("Formula"),
    FORM("Form"),
    RB("RB"),
    RT("RT"),
    RP("RP"),
    WT("WT"),
    WP("WP"),
    ART("Art"),
    SECT("Sect"),
    BLOCK_QUOTE("BlockQuote"),
    TOC("TOC"),
    TOCI("TOCI"),
    INDEX("Index"),
    NON_STRUCT("NonStruct"),
    PRIVATE("Private"),
    QUOTE("Quote"),
    NOTE("Note"),
    REFERENCE("Reference"),
    BIB_ENTRY("BibEntry"),
    CODE("Code"),
    H1("H1"),
    H2("H2"),
    H3("H3"),
    H4("H4"),
    H5("H5"),
    H6("H6"),
    DOCUMENT_FRAGMENT("DocumentFragment"),
    ASIDE("Aside"),
    TITLE("Title"),
    FENOTE("FENote"),
    SUB("Sub"),
    EM("Em"),
    STRONG("Strong"),
    ARTIFACT("Artifact");

    /** The name string associated with this constant. */
    private final String tagName;

    // Enum constructors are implicitly private.
    StructureType(String tagName) {
        this.tagName = tagName;
    }

    /**
     * Returns the name string of this structure type.
     *
     * @return the name given to the constant at declaration, e.g. {@code "BlockQuote"}
     */
    public String string() {
        return this.tagName;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
package org.verapdf.tools;

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDPropertyList;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts.FontName;

/**
 * Command-line generator of tagged PDF files.
 *
 * <p>For every entry of {@link #files} one single-page PDF is written:
 * <ul>
 *   <li>{@code all_inclusions_test.pdf} - every non-transitional structure type
 *       containing every non-transitional structure type (twice each), plus marked
 *       content directly under the structure tree root;</li>
 *   <li>{@code transitionaltag_*_test.pdf} - every structure type wrapping a
 *       transitional element (Div, NonStruct or Part) that in turn contains every
 *       structure type (twice each).</li>
 * </ul>
 *
 * <p>The optional first command-line argument is the output folder. When it is absent
 * or does not exist, {@code <user.dir>/generated_files} is used instead.
 */
public class TaggedPDFGenerator {
    private static final Logger logger = Logger.getLogger("");

    /** Key in {@link #files} selecting the "all inclusions" document. */
    private static final String ALL_INCLUSIONS = "all_inclusions";
    /** Text shown inside every generated marked-content sequence. */
    private static final String BLANK_TEXT = " ";
    /** How many times each child structure type is appended to a parent. */
    private static final int CHILD_REPEAT = 2;

    /** All structure type names, sorted. */
    private static final Set<String> types = new TreeSet<>();
    /** All structure type names except the transitional ones, sorted. */
    private static final Set<String> typesExcludeTransit = new TreeSet<>();
    /** Structure type names treated as transitional (Div, NonStruct, Part). */
    private static final Set<String> transitionalTypes = new TreeSet<>();
    /** Maps a document kind (transitional type name or "all_inclusions") to its file base name. */
    private static final Map<String, String> files = new HashMap<>();

    static {
        transitionalTypes.add(StructureType.DIV.string());
        transitionalTypes.add(StructureType.NON_STRUCT.string());
        transitionalTypes.add(StructureType.PART.string());

        for (StructureType type : StructureType.values()) {
            types.add(type.string());
            typesExcludeTransit.add(type.string());
        }
        typesExcludeTransit.removeAll(transitionalTypes);

        files.put(StructureType.DIV.string(), "transitionaltag_div_test");
        files.put(StructureType.NON_STRUCT.string(), "transitionaltag_nonstruct_test");
        files.put(StructureType.PART.string(), "transitionaltag_part_test");
        files.put(ALL_INCLUSIONS, "all_inclusions_test");
    }

    /**
     * Program entry point. Any failure is logged (with its stack trace) instead of
     * being propagated.
     *
     * @param args optional; {@code args[0]} is the output folder
     */
    public static void main(String[] args) {
        TaggedPDFGenerator taggedPDFGenerator = new TaggedPDFGenerator();

        try {
            taggedPDFGenerator.run(args);
        } catch (Exception ex) {
            // Pass the throwable to the logger so the stack trace goes through the
            // configured handlers rather than a bare printStackTrace().
            logger.log(Level.SEVERE,
                    "Error during pdf generation: " + ex.getMessage() + ", proccess stopped.", ex);
        }
    }

    /** Next MCID to assign; reset to 1 before each document is generated. */
    private int currentMCID = 1;
    /** Output folder requested on the command line, or {@code null} when none was given. */
    private String folder;

    /**
     * Resolves the directory the PDFs are written to.
     *
     * @return {@link #folder} when it is set and exists, otherwise the absolute path of
     *         {@code <user.dir>/generated_files}, created on demand
     * @throws IOException if the fallback directory cannot be created
     */
    private String getWorkingDir() throws IOException {
        if (folder != null && new File(folder).exists()) {
            return folder;
        }

        // Use the parent/child constructor so the platform separator is applied;
        // a hard-coded backslash only yields a sub-directory on Windows.
        File fallback = new File(System.getProperty("user.dir"), "generated_files");
        if (!fallback.mkdirs() && !fallback.isDirectory()) {
            throw new IOException("Cannot create output directory: " + fallback.getAbsolutePath());
        }

        return fallback.getAbsolutePath();
    }

    /**
     * Generates and saves one PDF per entry of {@link #files}.
     *
     * @param args command-line arguments; {@code args[0]}, if present, is the output folder
     * @throws IOException if a document cannot be built or written
     */
    private void run(String[] args) throws IOException {
        if (args.length > 0) {
            folder = args[0];
        }

        String parentFolder = getWorkingDir();

        for (Map.Entry<String, String> entry : files.entrySet()) {
            String type = entry.getKey();
            currentMCID = 1;

            File file = Paths.get(parentFolder).resolve(entry.getValue() + ".pdf").toFile();

            // try-with-resources closes the document even when save() fails.
            try (PDDocument document = ALL_INCLUSIONS.equals(type)
                    ? allInclusionsPDF(type)
                    : transitionalPDF(type)) {
                document.save(file, CompressParameters.NO_COMPRESSION);
            }
        }
    }

    /**
     * Builds the "all inclusions" document: two marked-content kids directly under the
     * structure tree root, then for every non-transitional type one element holding two
     * marked-content kids and two children of every non-transitional type, followed by
     * one empty element of the same type.
     *
     * @param pdftype unused; kept for signature symmetry with {@link #transitionalPDF(String)}
     * @return the open document; the caller is responsible for closing it
     * @throws IOException if writing the page content fails
     */
    private PDDocument allInclusionsPDF(String pdftype) throws IOException {
        PDDocument document = new PDDocument();
        try {
            PDPage page = new PDPage();
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDStructureTreeRootAccess treeRoot = new PDStructureTreeRootAccess();

            document.addPage(page);
            catalog.setStructureTreeRoot(treeRoot);

            try (PDPageContentStream content =
                         new PDPageContentStream(document, page, AppendMode.OVERWRITE, false)) {
                treeRoot.appendKid(textContent(content, BLANK_TEXT));
                treeRoot.appendKid(textContent(content, BLANK_TEXT));

                for (String type : typesExcludeTransit) {
                    PDStructureElement element = new PDStructureElement(type, treeRoot);

                    element.appendKid(textContent(content, BLANK_TEXT));
                    element.appendKid(textContent(content, BLANK_TEXT));
                    for (String subType : typesExcludeTransit) {
                        for (int index = 0; index < CHILD_REPEAT; index++) {
                            PDStructureElement subElement = new PDStructureElement(subType, element);
                            subElement.setPage(page);

                            element.appendKid(subElement);
                        }
                    }

                    element.setPage(page);
                    treeRoot.appendKid(element);

                    // A second, empty element of the same type.
                    element = new PDStructureElement(type, treeRoot);
                    element.setPage(page);
                    treeRoot.appendKid(element);
                }
            }

            return document;
        } catch (IOException | RuntimeException ex) {
            closeAfterFailure(document, ex);
            throw ex;
        }
    }

    /**
     * Builds a "transitional" document: for every structure type one element that holds
     * a transitional element (with two marked-content kids and two children of every
     * structure type) followed by a second, empty transitional element.
     *
     * @param transitionalType name of the transitional structure type placed between
     *                         parent and children
     * @return the open document; the caller is responsible for closing it
     * @throws IOException if writing the page content fails
     */
    private PDDocument transitionalPDF(String transitionalType) throws IOException {
        PDDocument document = new PDDocument();
        try {
            PDPage page = new PDPage();
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDStructureTreeRoot treeRoot = new PDStructureTreeRoot();

            document.addPage(page);
            catalog.setStructureTreeRoot(treeRoot);

            try (PDPageContentStream content =
                         new PDPageContentStream(document, page, AppendMode.OVERWRITE, false)) {
                for (String type : types) {
                    PDStructureElement element = new PDStructureElement(type, treeRoot);
                    PDStructureElement transitionalElement =
                            new PDStructureElement(transitionalType, element);

                    transitionalElement.appendKid(textContent(content, BLANK_TEXT));
                    transitionalElement.appendKid(textContent(content, BLANK_TEXT));
                    for (String subType : types) {
                        for (int index = 0; index < CHILD_REPEAT; index++) {
                            PDStructureElement subElement =
                                    new PDStructureElement(subType, transitionalElement);
                            subElement.setPage(page);

                            transitionalElement.appendKid(subElement);
                        }
                    }

                    element.setPage(page);
                    element.appendKid(transitionalElement);

                    // A second, empty transitional element under the same parent.
                    transitionalElement = new PDStructureElement(transitionalType, element);
                    element.appendKid(transitionalElement);
                    treeRoot.appendKid(element);
                }
            }

            return document;
        } catch (IOException | RuntimeException ex) {
            closeAfterFailure(document, ex);
            throw ex;
        }
    }

    /**
     * Closes a document whose construction failed, attaching any close failure to the
     * original exception as suppressed so the root cause is not masked.
     */
    private static void closeAfterFailure(PDDocument document, Exception failure) {
        try {
            document.close();
        } catch (IOException closeFailure) {
            failure.addSuppressed(closeFailure);
        }
    }

    /**
     * Writes one text object containing a {@code P} marked-content sequence with the
     * next MCID and returns the matching marked-content descriptor.
     *
     * @param content the page content stream to write to
     * @param text    the text to show inside the marked-content sequence
     * @return marked content tagged {@code P} carrying the MCID that was just written
     * @throws IOException if writing to the content stream fails
     */
    private PDMarkedContent textContent(PDPageContentStream content, String text) throws IOException {
        content.beginText();
        // NOTE(review): a fresh font object is created on every call; hoisting it may
        // alter the page's font resources in the generated files - confirm before changing.
        content.setFont(new PDType1Font(FontName.HELVETICA_BOLD), 14);

        COSDictionary dictionary = new COSDictionary();
        dictionary.setInt(COSName.MCID, currentMCID);
        currentMCID++;

        content.beginMarkedContent(COSName.P, PDPropertyList.create(dictionary));
        content.showText(text);
        content.endMarkedContent();
        PDMarkedContent markedContent = new PDMarkedContent(COSName.P, dictionary);
        content.endText();

        return markedContent;
    }
}

0 comments on commit 6680fd6

Please sign in to comment.