-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Regression tests generator for ISO 32005 (Tagged PDF)
- Loading branch information
1 parent
7c4a6da
commit 6680fd6
Showing
4 changed files
with
354 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>

<!-- Build descriptor for the tagged PDF regression-file generator module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>verapdf-tools</artifactId>
        <groupId>org.verapdf</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.verapdf</groupId>

    <artifactId>tagged-pdf-generation</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>3.0.2</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <!-- 4.13.1+ contains the fix for the TemporaryFolder information disclosure (CVE-2020-15250) -->
            <version>4.13.2</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
            </plugin>
            <!-- Packages the generator and its dependencies into a single runnable jar -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>org.verapdf.tools.TaggedPDFGenerator</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <appendAssemblyId>false</appendAssemblyId>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id> <!-- this is used for inheritance merges -->
                        <phase>package</phase> <!-- bind to the packaging phase -->
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
21 changes: 21 additions & 0 deletions
21
tagged-pdf-generation/src/main/java/org/verapdf/tools/PDStructureTreeRootAccess.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package org.verapdf.tools; | ||
|
||
import org.apache.pdfbox.cos.COSInteger; | ||
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot; | ||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; | ||
|
||
/** | ||
* Allows to add PDMarkedContent in PDStructureTreeRoot | ||
*/ | ||
public class PDStructureTreeRootAccess extends PDStructureTreeRoot { | ||
public PDStructureTreeRootAccess() { | ||
super(); | ||
} | ||
|
||
public void appendKid(PDMarkedContent markedContent) { | ||
if (markedContent == null) { | ||
return; | ||
} | ||
this.appendKid(COSInteger.get(markedContent.getMCID())); | ||
} | ||
} |
70 changes: 70 additions & 0 deletions
70
tagged-pdf-generation/src/main/java/org/verapdf/tools/StructureType.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
package org.verapdf.tools; | ||
|
||
/**
 * Structure type names available to the generator. Every constant carries the
 * exact spelling of the name, which is returned by {@link #string()}.
 */
public enum StructureType {
    DOCUMENT("Document"), PART("Part"), DIV("Div"), CAPTION("Caption"),
    THEAD("THead"), TBODY("TBody"), TFOOT("TFoot"),
    H("H"), P("P"),
    L("L"), LI("LI"), LBL("Lbl"), LBODY("LBody"),
    TABLE("Table"), TR("TR"), TH("TH"), TD("TD"),
    SPAN("Span"), LINK("Link"), ANNOT("Annot"),
    RUBY("Ruby"), WARICHU("Warichu"),
    FIGURE("Figure"), FORMULA("Formula"), FORM("Form"),
    RB("RB"), RT("RT"), RP("RP"), WT("WT"), WP("WP"),
    ART("Art"), SECT("Sect"), BLOCK_QUOTE("BlockQuote"),
    TOC("TOC"), TOCI("TOCI"), INDEX("Index"),
    NON_STRUCT("NonStruct"), PRIVATE("Private"),
    QUOTE("Quote"), NOTE("Note"), REFERENCE("Reference"), BIB_ENTRY("BibEntry"), CODE("Code"),
    H1("H1"), H2("H2"), H3("H3"), H4("H4"), H5("H5"), H6("H6"),
    DOCUMENT_FRAGMENT("DocumentFragment"), ASIDE("Aside"), TITLE("Title"), FENOTE("FENote"),
    SUB("Sub"), EM("Em"), STRONG("Strong"),
    ARTIFACT("Artifact");

    /** Spelling of the structure type name. */
    private final String typeName;

    StructureType(String typeName) {
        this.typeName = typeName;
    }

    /**
     * @return the structure type name associated with this constant
     */
    public String string() {
        return this.typeName;
    }
}
195 changes: 195 additions & 0 deletions
195
tagged-pdf-generation/src/main/java/org/verapdf/tools/TaggedPDFGenerator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
package org.verapdf.tools; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.nio.file.Paths; | ||
|
||
import org.apache.pdfbox.cos.COSDictionary; | ||
import org.apache.pdfbox.cos.COSName; | ||
import org.apache.pdfbox.pdfwriter.compress.CompressParameters; | ||
import org.apache.pdfbox.pdmodel.PDDocument; | ||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog; | ||
import org.apache.pdfbox.pdmodel.PDPage; | ||
import org.apache.pdfbox.pdmodel.PDPageContentStream; | ||
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode; | ||
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement; | ||
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot; | ||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; | ||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDPropertyList; | ||
import org.apache.pdfbox.pdmodel.font.PDType1Font; | ||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts.FontName; | ||
|
||
import java.util.TreeSet; | ||
import java.util.logging.Logger; | ||
import java.util.HashMap; | ||
import java.util.Set; | ||
|
||
public class TaggedPDFGenerator { | ||
private static Logger logger = Logger.getLogger(""); | ||
private static Set<String> types = new TreeSet<String>(); | ||
private static Set<String> typesExcludeTransit = new TreeSet<String>(); | ||
private static Set<String> transitionalTypes = new TreeSet<String>(); | ||
private static HashMap<String, String> files = new HashMap<String, String>(); | ||
static { | ||
transitionalTypes.add(StructureType.DIV.string()); | ||
transitionalTypes.add(StructureType.NON_STRUCT.string()); | ||
transitionalTypes.add(StructureType.PART.string()); | ||
|
||
for (StructureType type : StructureType.values()) { | ||
types.add(type.string()); | ||
typesExcludeTransit.add(type.string()); | ||
} | ||
|
||
for (String type : transitionalTypes) { | ||
typesExcludeTransit.remove(type); | ||
} | ||
|
||
files.put(StructureType.DIV.string(), "transitionaltag_div_test"); | ||
files.put(StructureType.NON_STRUCT.string(), "transitionaltag_nonstruct_test"); | ||
files.put(StructureType.PART.string(), "transitionaltag_part_test"); | ||
files.put("all_inclusions", "all_inclusions_test"); | ||
} | ||
|
||
public static void main(String[] args) { | ||
TaggedPDFGenerator taggedPDFGenerator = new TaggedPDFGenerator(); | ||
|
||
try { | ||
taggedPDFGenerator.run(args); | ||
} catch (Exception ex) { | ||
logger.severe("Error during pdf generation: " + ex.getMessage() + ", proccess stopped."); | ||
ex.printStackTrace(); | ||
} | ||
} | ||
|
||
private Integer currentMCID = 1; | ||
private String folder; | ||
|
||
private String getWorkingDir() { | ||
if (folder != null && (new File(folder)).exists()) { | ||
return folder; | ||
} | ||
|
||
File file = new File(System.getProperty("user.dir") + "\\generated_files"); | ||
file.mkdirs(); | ||
|
||
return file.getAbsolutePath(); | ||
} | ||
|
||
private void run(String[] args) throws IOException { | ||
if (args.length > 0){ | ||
folder = args[0]; | ||
} | ||
|
||
String parentFolder = getWorkingDir(); | ||
|
||
for (String type : files.keySet()) { | ||
currentMCID = 1; | ||
|
||
PDDocument document; | ||
if (type.equals("all_inclusions")) { | ||
document = allInclusionsPDF(type); | ||
} else { | ||
document = transitionalPDF(type); | ||
} | ||
|
||
File file = Paths.get(parentFolder).resolve(files.get(type) + ".pdf").toFile(); | ||
|
||
document.save(file, CompressParameters.NO_COMPRESSION); | ||
document.close(); | ||
} | ||
} | ||
|
||
private PDDocument allInclusionsPDF(String pdftype) throws IOException { | ||
PDDocument document = new PDDocument(); | ||
PDPage page = new PDPage(); | ||
PDDocumentCatalog catalog = document.getDocumentCatalog(); | ||
PDStructureTreeRootAccess treeRoot = new PDStructureTreeRootAccess(); | ||
|
||
document.addPage(page); | ||
catalog.setStructureTreeRoot(treeRoot); | ||
|
||
PDPageContentStream content = new PDPageContentStream(document, page, AppendMode.OVERWRITE, false); | ||
|
||
treeRoot.appendKid(textContent(content, " ")); | ||
treeRoot.appendKid(textContent(content, " ")); | ||
for (String type : typesExcludeTransit) { | ||
PDStructureElement element = new PDStructureElement(type, treeRoot); | ||
|
||
element.appendKid(textContent(content, " ")); | ||
element.appendKid(textContent(content, " ")); | ||
for (String subType : typesExcludeTransit) { | ||
for (Integer index = 0; index < 2; index++) { | ||
PDStructureElement subElement = new PDStructureElement(subType, element); | ||
subElement.setPage(page); | ||
|
||
element.appendKid(subElement); | ||
} | ||
} | ||
|
||
element.setPage(page); | ||
treeRoot.appendKid(element); | ||
|
||
element = new PDStructureElement(type, treeRoot); | ||
element.setPage(page); | ||
treeRoot.appendKid(element); | ||
} | ||
|
||
content.close(); | ||
|
||
return document; | ||
} | ||
|
||
private PDDocument transitionalPDF(String transitionalType) throws IOException { | ||
PDDocument document = new PDDocument(); | ||
PDPage page = new PDPage(); | ||
PDDocumentCatalog catalog = document.getDocumentCatalog(); | ||
PDStructureTreeRoot treeRoot = new PDStructureTreeRoot(); | ||
|
||
document.addPage(page); | ||
catalog.setStructureTreeRoot(treeRoot); | ||
|
||
PDPageContentStream content = new PDPageContentStream(document, page, AppendMode.OVERWRITE, false); | ||
|
||
for (String type : types) { | ||
PDStructureElement element = new PDStructureElement(type, treeRoot); | ||
PDStructureElement transitionalElement = new PDStructureElement(transitionalType, element); | ||
|
||
transitionalElement.appendKid(textContent(content, " ")); | ||
transitionalElement.appendKid(textContent(content, " ")); | ||
for (String subType : types) { | ||
for (Integer index = 0; index < 2; index++) { | ||
PDStructureElement sub_element = new PDStructureElement(subType, transitionalElement); | ||
sub_element.setPage(page); | ||
|
||
transitionalElement.appendKid(sub_element); | ||
} | ||
} | ||
|
||
element.setPage(page); | ||
element.appendKid(transitionalElement); | ||
|
||
transitionalElement = new PDStructureElement(transitionalType, element); | ||
element.appendKid(transitionalElement); | ||
treeRoot.appendKid(element); | ||
} | ||
|
||
content.close(); | ||
|
||
return document; | ||
} | ||
|
||
private PDMarkedContent textContent(PDPageContentStream content, String text) throws IOException { | ||
content.beginText(); | ||
content.setFont(new PDType1Font(FontName.HELVETICA_BOLD), 14); | ||
COSDictionary dictionary = new COSDictionary(); | ||
dictionary.setInt(COSName.MCID, currentMCID); | ||
currentMCID++; | ||
content.beginMarkedContent(COSName.P, PDPropertyList.create(dictionary)); | ||
content.showText(text); | ||
content.endMarkedContent(); | ||
PDMarkedContent markedContent = new PDMarkedContent(COSName.P, dictionary); | ||
content.endText(); | ||
|
||
return markedContent; | ||
} | ||
} |