Skip to content

Commit

Permalink
Partially fix Pdfium PDF/A compliance and improve tests
Browse files Browse the repository at this point in the history
  • Loading branch information
cyanfish committed Apr 6, 2024
1 parent 8e3ea2a commit 3490a52
Show file tree
Hide file tree
Showing 7 changed files with 113 additions and 30 deletions.
13 changes: 13 additions & 0 deletions NAPS2.Sdk.Tests/Asserts/PdfAsserts.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ public static void AssertPageSize(PageSize pageSize, int precision, string fileP

public static async Task AssertCompliant(string profile, string filePath)
{
if (string.IsNullOrEmpty(profile))
{
return;
}
Assert.True(File.Exists(filePath));
var report = await LazyPdfAValidator.Value.ValidateWithDetailedReportAsync(filePath);
Assert.True(report.Jobs.Job.ValidationReport.IsCompliant);
Expand Down Expand Up @@ -128,4 +132,13 @@ public static void AssertImageFilter(string filePath, int pageIndex, params stri
$"Expected filters: {string.Join(",", filters)}, actual: {string.Join(",", obj.GetImageFilters())}");
}
}

/// <summary>
/// Asserts that the PDF at <paramref name="filePath"/> reports the expected file version.
/// </summary>
/// <param name="version">Expected value of <c>PdfDocument.Version</c> for the loaded file.</param>
/// <param name="filePath">Path of the PDF file to load and inspect.</param>
public static void AssertVersion(int version, string filePath)
{
    // The document is loaded and queried while holding the PdfiumNativeLibrary.Instance lock.
    lock (PdfiumNativeLibrary.Instance)
    {
        using (var pdf = PdfDocument.Load(filePath))
        {
            int? actualVersion = pdf.Version;
            Assert.Equal(version, actualVersion);
        }
    }
}
}
85 changes: 63 additions & 22 deletions NAPS2.Sdk.Tests/Pdf/PdfATests.cs
Original file line number Diff line number Diff line change
@@ -1,35 +1,76 @@
using NAPS2.Ocr;
using NAPS2.Pdf;
using NAPS2.Sdk.Tests.Asserts;
using Xunit;

namespace NAPS2.Sdk.Tests.Pdf;

// TODO: Validate with OCR output
// TODO: Maaaybe validate with external import? We certainly can't guarantee it, but maybe some cases can be verified for best effort
public class PdfATests : ContextualTests
{
// Sadly the pdfa verifier library only supports windows/mac
[PlatformFact(exclude: PlatformFlags.Mac)]
public async Task Validate()
private readonly PdfExporter _pdfExporter;
private readonly string _path;
private readonly string _importPath;

public PdfATests()
{
var pdfExporter = new PdfExporter(ScanningContext);
var testCases = new (PdfCompat pdfCompat, string profile, string fileName)[]
_pdfExporter = new PdfExporter(ScanningContext);
_path = Path.Combine(FolderPath, "test.pdf");
_importPath = CopyResourceToFile(PdfResources.word_patcht_pdf, "word.pdf");
}

// Sadly the pdfa verifier library only supports windows/linux
[PlatformTheory(exclude: PlatformFlags.Mac)]
[MemberData(nameof(TestCases))]
public async Task Validate(PdfCompat pdfCompat, string profile, int version)
{
await _pdfExporter.Export(_path, new[] { CreateScannedImage() }, new PdfExportParams
{
(PdfCompat.PdfA1B, "PDF/A-1B", "pdfa1b_test.pdf"),
(PdfCompat.PdfA2B, "PDF/A-2B", "pdfa2b_test.pdf"),
(PdfCompat.PdfA3B, "PDF/A-3B", "pdfa3b_test.pdf"),
(PdfCompat.PdfA3U, "PDF/A-3U", "pdfa3u_test.pdf")
};
Compat = pdfCompat
});

var tasks = testCases.Select(testCase =>
PdfAsserts.AssertVersion(version, _path);
await PdfAsserts.AssertCompliant(profile, _path);
}

[PlatformTheory(exclude: PlatformFlags.Mac)]
[MemberData(nameof(TestCases))]
public async Task ValidateWithOcr(PdfCompat pdfCompat, string profile, int version)
{
SetUpFakeOcr(ifNoMatch: "hello world");

await _pdfExporter.Export(_path, new[] { CreateScannedImage() }, new PdfExportParams
{
using var image = CreateScannedImage();
var path = Path.Combine(FolderPath, testCase.fileName);
pdfExporter.Export(path, new[] { image }, new PdfExportParams
{
Compat = testCase.pdfCompat
}).Wait();
return PdfAsserts.AssertCompliant(testCase.profile, path);
}).ToArray();
await Task.WhenAll(tasks);
Compat = pdfCompat
}, new OcrParams("eng"));

PdfAsserts.AssertVersion(version, _path);
await PdfAsserts.AssertCompliant(profile, _path);
}

/// <summary>
/// Imports an existing PDF, re-exports it with the given compatibility setting, and then checks
/// the resulting file's version and compliance against the expected values.
/// </summary>
[PlatformTheory(exclude: PlatformFlags.Mac)]
[MemberData(nameof(TestCases))]
public async Task ValidateWithPdfium(PdfCompat pdfCompat, string profile, int version)
{
    var importer = new PdfImporter(ScanningContext);
    var importedPages = await importer.Import(_importPath).ToListAsync();
    var exportParams = new PdfExportParams
    {
        Compat = pdfCompat
    };

    await _pdfExporter.Export(_path, importedPages, exportParams);

    PdfAsserts.AssertVersion(version, _path);
    await PdfAsserts.AssertCompliant(profile, _path);
}

// Note that we don't have a Pdfium OCR test as we fail compliance due to the way Pdfium embeds fonts, which isn't
// practical to fix.

/// <summary>
/// Theory data shared by the validation tests. Each row supplies, in order, the
/// <c>pdfCompat</c>, <c>profile</c> and <c>version</c> arguments of the theory methods.
/// An empty profile makes <c>PdfAsserts.AssertCompliant</c> return without validating,
/// so for that row only the file version is checked.
/// </summary>
// Declared readonly so the shared test data can't be reassigned by other code.
public static readonly IEnumerable<object[]> TestCases =
[
    [PdfCompat.Default, "", 14],
    [PdfCompat.PdfA1B, "PDF/A-1B", 14],
    [PdfCompat.PdfA2B, "PDF/A-2B", 17],
    [PdfCompat.PdfA3B, "PDF/A-3B", 17],
    [PdfCompat.PdfA3U, "PDF/A-3U", 17]
];
}
19 changes: 19 additions & 0 deletions NAPS2.Sdk.Tests/PlatformTheoryAttribute.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using System.Runtime.InteropServices;
using Xunit;

namespace NAPS2.Sdk.Tests;

/// <summary>
/// A theory attribute that sets <c>Skip</c> based on the flags returned by
/// <c>CurrentPlatformFlags.Get()</c>.
/// </summary>
public sealed class PlatformTheoryAttribute : TheoryAttribute
{
    /// <param name="include">
    /// When not <c>None</c>, the theory is skipped unless every one of these flags is set for the current platform.
    /// </param>
    /// <param name="exclude">
    /// When not <c>None</c>, the theory is skipped if any of these flags is set for the current platform.
    /// </param>
    public PlatformTheoryAttribute(PlatformFlags include = PlatformFlags.None, PlatformFlags exclude = PlatformFlags.None)
    {
        bool lacksIncluded = include != PlatformFlags.None
                             && (CurrentPlatformFlags.Get() & include) != include;
        bool hitsExcluded = exclude != PlatformFlags.None
                            && (CurrentPlatformFlags.Get() & exclude) != PlatformFlags.None;

        // The exclusion message takes precedence when both conditions hold.
        if (hitsExcluded)
        {
            Skip = $"Doesn't run on platform(s): {exclude}";
        }
        else if (lacksIncluded)
        {
            Skip = $"Only runs on platform(s): {include}";
        }
    }
}
9 changes: 5 additions & 4 deletions NAPS2.Sdk/Pdf/PdfAHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ namespace NAPS2.Pdf;

internal static class PdfAHelper
{
public static void CreateXmpMetadata(PdfDocument document, PdfCompat compat)
public static void CreateXmpMetadata(PdfDocument document, PdfCompat compat, string producer)
{
var metadataDict = new PdfDictionary(document);
metadataDict.Elements["/Type"] = new PdfName("/Metadata");
metadataDict.Elements["/Subtype"] = new PdfName("/XML");
metadataDict.CreateStream(CreateRawXmpMetadata(document.Info, GetConformance(compat)));
metadataDict.CreateStream(CreateRawXmpMetadata(document.Info, GetConformance(compat), producer));
document.Internals.AddObject(metadataDict);
document.Internals.Catalog.Elements["/Metadata"] = metadataDict.Reference;
}
Expand All @@ -33,7 +33,8 @@ private static (string, string) GetConformance(PdfCompat compat)
}
}

private static byte[] CreateRawXmpMetadata(PdfDocumentInformation info, (string, string) conformance)
private static byte[] CreateRawXmpMetadata(PdfDocumentInformation info, (string, string) conformance,
string producer)
{
string xml = $@"<?xpacket begin=""{'\ufeff'}"" id=""W5M0MpCehiHzreSzNTczkc9d""?>
<x:xmpmeta xmlns:x=""adobe:ns:meta/"" x:xmptk=""Adobe XMP Core 5.1.0-jc003"">
Expand All @@ -45,7 +46,7 @@ private static byte[] CreateRawXmpMetadata(PdfDocumentInformation info, (string,
xmlns:pdfaid=""http://www.aiim.org/pdfa/ns/id/""
dc:format=""application/pdf""
pdf:Keywords=""{info.Keywords}""
pdf:Producer=""{PdfSharpCore.ProductVersionInfo.Producer}""
pdf:Producer=""{producer}""
xmp:CreateDate=""{info.CreationDate:yyyy'-'MM'-'dd'T'HH':'mm':'ssK}""
xmp:ModifyDate=""{info.ModificationDate:yyyy'-'MM'-'dd'T'HH':'mm':'ssK}""
xmp:CreatorTool=""{info.Creator}""
Expand Down
12 changes: 8 additions & 4 deletions NAPS2.Sdk/Pdf/PdfExporter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public class PdfExporter
{
private const int PDF_VERSION_14 = 14;
private const int PDF_VERSION_17 = 17;
private const string PDFIUM_PRODUCER = "PDFium";

private readonly ScanningContext _scanningContext;
private readonly ILogger _logger;
Expand Down Expand Up @@ -133,8 +134,9 @@ void IncrementProgress()
await pdfPagesOcrPipeline;
if (progress.IsCancellationRequested) return false;

var producer = pdfPages.Any() ? PDFIUM_PRODUCER : PdfSharpCore.ProductVersionInfo.Producer;
// TODO: Doing in memory as that's presumably faster than IO, but of course that's quite a bit of memory use potentially...
var stream = FinalizeAndSaveDocument(document, exportParams);
var stream = FinalizeAndSaveDocument(document, exportParams, producer);
if (progress.IsCancellationRequested) return false;

return MergePassthroughPages(stream, output, pdfPages, exportParams, progress);
Expand Down Expand Up @@ -296,7 +298,8 @@ private PageExportState WriteToPdfSharpStep(PageExportState state)
return state;
}

private static MemoryStream FinalizeAndSaveDocument(PdfDocument document, PdfExportParams exportParams)
private static MemoryStream FinalizeAndSaveDocument(PdfDocument document, PdfExportParams exportParams,
string producer)
{
var compat = exportParams.Compat;
var now = DateTime.Now;
Expand All @@ -312,7 +315,7 @@ private static MemoryStream FinalizeAndSaveDocument(PdfDocument document, PdfExp
{
PdfAHelper.SetColorProfile(document);
PdfAHelper.SetCidMap(document);
PdfAHelper.CreateXmpMetadata(document, compat);
PdfAHelper.CreateXmpMetadata(document, compat, producer);
}

document.Version = compat switch
Expand Down Expand Up @@ -465,7 +468,8 @@ private static IEnumerable<TextDrawInfo> GetOcrTextToDraw(PdfPage page, OcrResul
while (true)
{
var font = new XFont(lineFontFamily, lineFontSize + 1, XFontStyle.Regular);
if (eligibleWords.All(word => gfx.MeasureString(word.Text, font).Width < word.Bounds.w * hAdjust))
if (eligibleWords.All(
word => gfx.MeasureString(word.Text, font).Width < word.Bounds.w * hAdjust))
{
lineFontSize++;
}
Expand Down
2 changes: 2 additions & 0 deletions NAPS2.Sdk/Pdf/Pdfium/PdfDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ private PdfDocument(IntPtr handle, IDisposable? readLock = null) : base(handle)

public int PageCount => Native.FPDF_GetPageCount(Handle);

/// <summary>
/// The file version reported by the native <c>FPDF_GetFileVersion</c> call, or <c>null</c> when that call
/// returns false.
/// </summary>
public int? Version
{
    get
    {
        if (Native.FPDF_GetFileVersion(Handle, out int fileVersion))
        {
            return fileVersion;
        }
        return null;
    }
}

public PdfPage GetPage(int pageIndex)
{
return new PdfPage(Native.FPDF_LoadPage(Handle, pageIndex), this, pageIndex);
Expand Down
3 changes: 3 additions & 0 deletions NAPS2.Sdk/Pdf/Pdfium/PdfiumNativeLibrary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ public delegate void FPDFBitmap_FillRect_delegate(IntPtr bitmap, int left, int t

public delegate bool FPDF_SaveAsCopy_delegate(IntPtr document, ref FPDF_FileWrite fileWrite, int flags);

// Managed signature for the native FPDF_GetFileVersion function: takes a document handle and writes the
// version to fileVersion. NOTE(review): the bool result is treated by callers as success/failure - confirm
// against the Pdfium header.
public delegate bool FPDF_GetFileVersion_delegate(IntPtr document, out int fileVersion);

public delegate IntPtr FPDF_GetMetaText_delegate(IntPtr document, [MarshalAs(UnmanagedType.LPStr)] string tag,
byte[]? buffer, IntPtr buflen);

Expand Down Expand Up @@ -214,6 +216,7 @@ public delegate void FPDF_FFLDraw_delegate(IntPtr handle, IntPtr bitmap, IntPtr
public FPDF_LoadMemDocument_delegate FPDF_LoadMemDocument => Load<FPDF_LoadMemDocument_delegate>();
public FPDF_CloseDocument_delegate FPDF_CloseDocument => Load<FPDF_CloseDocument_delegate>();
public FPDF_SaveAsCopy_delegate FPDF_SaveAsCopy => Load<FPDF_SaveAsCopy_delegate>();
// Binding for FPDF_GetFileVersion, obtained through Load<T>() like the other native function properties here.
public FPDF_GetFileVersion_delegate FPDF_GetFileVersion => Load<FPDF_GetFileVersion_delegate>();
public FPDF_GetMetaText_delegate FPDF_GetMetaText => Load<FPDF_GetMetaText_delegate>();
public FPDF_GetPageCount_delegate FPDF_GetPageCount => Load<FPDF_GetPageCount_delegate>();
public FPDF_LoadPage_delegate FPDF_LoadPage => Load<FPDF_LoadPage_delegate>();
Expand Down

0 comments on commit 3490a52

Please sign in to comment.