Skip to content

Commit

Permalink
Merge pull request #1170 from digital-preservation/gz-container-identification
Browse files Browse the repository at this point in the history

Add GZIP container identification.
  • Loading branch information
MancunianSam authored Mar 6, 2025
2 parents baeecf2 + be35617 commit fba0b3a
Show file tree
Hide file tree
Showing 19 changed files with 645 additions and 136 deletions.
26 changes: 21 additions & 5 deletions droid-api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,26 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>jakarta.xml.bind</groupId>
<artifactId>jakarta.xml.bind-api</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>xml-apis</groupId>
<artifactId>xml-apis</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.5</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@
import java.io.InputStream;
import java.nio.file.Path;

import uk.gov.nationalarchives.droid.container.ContainerFileIdentificationRequestFactory;
import uk.gov.nationalarchives.droid.container.ContainerSignatureFileReader;
import uk.gov.nationalarchives.droid.container.IdentifierEngine;
import uk.gov.nationalarchives.droid.container.*;
import uk.gov.nationalarchives.droid.container.gz.GzIdentifier;
import uk.gov.nationalarchives.droid.container.gz.GzIdentifierEngine;
import uk.gov.nationalarchives.droid.container.ole2.Ole2Identifier;
import uk.gov.nationalarchives.droid.container.ole2.Ole2IdentifierEngine;
import uk.gov.nationalarchives.droid.container.zip.ZipIdentifier;
Expand Down Expand Up @@ -70,12 +70,24 @@ private IdentificationRequestFactory<InputStream> requestFactory() {
return new ContainerFileIdentificationRequestFactory();
}

private IdentifierEngine identifierEngine() {
/**
 * Creates a {@link ZipIdentifierEngine} and assigns it a request factory
 * obtained from {@code requestFactory()}.
 *
 * @return the newly created engine
 */
private IdentifierEngine zipIdentifierEngine() {
    final ZipIdentifierEngine zipEngine = new ZipIdentifierEngine();
    zipEngine.setRequestFactory(requestFactory());
    return zipEngine;
}

/**
 * Creates a {@link GzIdentifierEngine} and assigns it a request factory
 * obtained from {@code requestFactory()}.
 *
 * @return the newly created engine
 */
private IdentifierEngine gzIdentifierEngine() {
    final GzIdentifierEngine gzEngine = new GzIdentifierEngine();
    gzEngine.setRequestFactory(requestFactory());
    return gzEngine;
}

/**
 * Creates an {@link Ole2IdentifierEngine} and assigns it a request factory
 * obtained from {@code requestFactory()}.
 *
 * @return the newly created engine
 */
private Ole2IdentifierEngine ole2IdentifierEngine() {
    final Ole2IdentifierEngine ole2Engine = new Ole2IdentifierEngine();
    ole2Engine.setRequestFactory(requestFactory());
    return ole2Engine;
}

/**
 * Supplies a fresh {@link ArchiveFormatResolverImpl} on every call.
 *
 * @return a new resolver instance
 */
private ArchiveFormatResolver archiveFormatResolver() {
    final ArchiveFormatResolver resolver = new ArchiveFormatResolverImpl();
    return resolver;
}
Expand All @@ -84,42 +96,29 @@ private ContainerIdentifierFactory identifierFactory() {
return new ContainerIdentifierFactoryImpl();
}

public ZipIdentifier zipIdentifier() {
ZipIdentifier zip = new ZipIdentifier();
zip.setContainerType("ZIP");
zip.setContainerIdentifierFactory(identifierFactory());
zip.setContainerFormatResolver(archiveFormatResolver());
zip.setDroidCore(droid);
zip.setIdentifierEngine(identifierEngine());
zip.setSignatureReader(signatureReader());

try {
zip.init();
} catch (SignatureFileException e) {
throw new RuntimeException("Unable to init zip identifier", e);
}
return zip;
/**
 * Builds a {@link GzIdentifier} by passing a new instance and the engine from
 * {@code gzIdentifierEngine()} to {@code initialiseContainerIdentifier}.
 *
 * @return the identifier returned by {@code initialiseContainerIdentifier}
 */
public GzIdentifier gzIdentifier() {
    final GzIdentifier identifier = new GzIdentifier();
    final IdentifierEngine engine = gzIdentifierEngine();
    return initialiseContainerIdentifier(identifier, engine);
}

private Ole2IdentifierEngine ole2IdentifierEngine() {
Ole2IdentifierEngine engine = new Ole2IdentifierEngine();
engine.setRequestFactory(requestFactory());
return engine;
/**
 * Builds a {@link ZipIdentifier} by passing a new instance and the engine from
 * {@code zipIdentifierEngine()} to {@code initialiseContainerIdentifier}.
 *
 * @return the identifier returned by {@code initialiseContainerIdentifier}
 */
public ZipIdentifier zipIdentifier() {
    final ZipIdentifier identifier = new ZipIdentifier();
    final IdentifierEngine engine = zipIdentifierEngine();
    return initialiseContainerIdentifier(identifier, engine);
}

/**
 * Builds the {@link Ole2Identifier} exposed by this class.
 *
 * NOTE(review): this view interleaves two versions of the body. The explicit
 * setter calls on {@code ole2} appear to be the earlier implementation, and the
 * final {@code return} delegates the same wiring to
 * {@code initialiseContainerIdentifier} - confirm against the actual file.
 *
 * @return the identifier returned by {@code initialiseContainerIdentifier}
 */
public Ole2Identifier ole2Identifier() {
Ole2Identifier ole2 = new Ole2Identifier();
ole2.setContainerType("OLE2");
ole2.setContainerIdentifierFactory(identifierFactory());
ole2.setContainerFormatResolver(archiveFormatResolver());
ole2.setDroidCore(droid);
ole2.setIdentifierEngine(ole2IdentifierEngine());
ole2.setSignatureReader(signatureReader());
return initialiseContainerIdentifier(new Ole2Identifier(), ole2IdentifierEngine());
}

/**
 * Wires a container identifier with its collaborators and then calls {@code init()} on it.
 *
 * The identifier receives the identifier factory, the archive format resolver,
 * the {@code droid} core, the supplied engine and the signature reader, in that order.
 *
 * NOTE(review): no {@code setContainerType(...)} call is made here - confirm the
 * container type is supplied elsewhere (for example by the identifier classes themselves).
 *
 * @param containerIdentifier the identifier to configure and initialise
 * @param identifierEngine the engine assigned to the identifier
 * @param <T> the concrete identifier type, so callers get back the type they passed in
 * @param <U> the engine type
 * @return the same {@code containerIdentifier} instance once {@code init()} has completed
 * @throws RuntimeException wrapping the {@code SignatureFileException} if {@code init()} fails
 */
private <T extends AbstractContainerIdentifier, U extends IdentifierEngine> T initialiseContainerIdentifier(T containerIdentifier, U identifierEngine) {
containerIdentifier.setContainerIdentifierFactory(identifierFactory());
containerIdentifier.setContainerFormatResolver(archiveFormatResolver());
containerIdentifier.setDroidCore(droid);
containerIdentifier.setIdentifierEngine(identifierEngine);
containerIdentifier.setSignatureReader(signatureReader());
try {
ole2.init();
containerIdentifier.init();
} catch (SignatureFileException ex) {
throw new RuntimeException("Unable to init Ole2Identifier", ex);
// The message names the concrete identifier class that failed to initialise.
throw new RuntimeException("Unable to init " + containerIdentifier.getClass().getSimpleName(), ex);
}
return ole2;
return containerIdentifier;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import java.nio.file.Files;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.ResourceBundle;
Expand Down Expand Up @@ -73,6 +74,7 @@ public final class DroidAPI {

private static final String ZIP_PUID = "x-fmt/263";
private static final String OLE2_PUID = "fmt/111";
private static final String GZIP_PUID = "x-fmt/266";

private static final AtomicLong ID_GENERATOR = new AtomicLong();

Expand All @@ -82,16 +84,19 @@ public final class DroidAPI {

private final ContainerIdentifier ole2Identifier;

private final ContainerIdentifier gzIdentifier;

private final String containerSignatureVersion;

private final String binarySignatureVersion;

private final String droidVersion;

private DroidAPI(DroidCore droidCore, ContainerIdentifier zipIdentifier, ContainerIdentifier ole2Identifier, String containerSignatureVersion, String binarySignatureVersion, String droidVersion) {
private DroidAPI(DroidCore droidCore, ContainerIdentifier zipIdentifier, ContainerIdentifier ole2Identifier, ContainerIdentifier gzIdentifier, String containerSignatureVersion, String binarySignatureVersion, String droidVersion) {
this.droidCore = droidCore;
this.zipIdentifier = zipIdentifier;
this.ole2Identifier = ole2Identifier;
this.gzIdentifier = gzIdentifier;
this.containerSignatureVersion = containerSignatureVersion;
this.binarySignatureVersion = binarySignatureVersion;
this.droidVersion = droidVersion;
Expand All @@ -114,7 +119,7 @@ public static DroidAPI getInstance(final Path binarySignature, final Path contai
String containerVersion = StringUtils.substringAfterLast(containerSignature.getFileName().toString(), "-").split("\\.")[0];
String droidVersion = ResourceBundle.getBundle("options").getString("version_no");
ContainerApi containerApi = new ContainerApi(droidCore, containerSignature);
return new DroidAPI(droidCore, containerApi.zipIdentifier(), containerApi.ole2Identifier(), containerVersion, droidCore.getSigFile().getVersion(), droidVersion);
return new DroidAPI(droidCore, containerApi.zipIdentifier(), containerApi.ole2Identifier(), containerApi.gzIdentifier(), containerVersion, droidCore.getSigFile().getVersion(), droidVersion);
}

/**
Expand Down Expand Up @@ -180,25 +185,20 @@ private IdentificationResultCollection identifyByExtension(final FileSystemIdent
}

/**
 * Finds the first PUID in the binary identification results that is one of the
 * container PUIDs ({@code ZIP_PUID}, {@code OLE2_PUID} or {@code GZIP_PUID}).
 *
 * NOTE(review): this view interleaves two versions of the body - a lambda filter
 * over ZIP/OLE2 only, and a list-based filter that also includes GZIP. Confirm
 * which lines are present in the actual file.
 *
 * @param binaryResult the results of binary signature identification
 * @return the first matching container PUID, or an empty Optional when none matches
 */
private Optional<String> getContainerPuid(final IdentificationResultCollection binaryResult) {
return binaryResult.getResults().stream().filter(x ->
ZIP_PUID.equals(x.getPuid()) || OLE2_PUID.equals(x.getPuid())
).map(IdentificationResult::getPuid).findFirst();
List<String> containerPuids = Arrays.asList(ZIP_PUID, OLE2_PUID, GZIP_PUID);
return binaryResult.getResults().stream()
.map(IdentificationResult::getPuid)
.filter(containerPuids::contains).findFirst();
}

private IdentificationResultCollection handleContainer(final IdentificationResultCollection binaryResult,
final FileSystemIdentificationRequest identificationRequest, final String containerPuid) throws IOException {
ContainerIdentifier identifier;

switch (containerPuid) {
case ZIP_PUID:
identifier = zipIdentifier;
break;
case OLE2_PUID:
identifier = ole2Identifier;
break;
default:
throw new RuntimeException("Unknown container PUID : " + containerPuid);
}
ContainerIdentifier identifier = switch (containerPuid) {
case ZIP_PUID -> zipIdentifier;
case OLE2_PUID -> ole2Identifier;
case GZIP_PUID -> gzIdentifier;
default -> throw new RuntimeException("Unknown container PUID : " + containerPuid);
};

IdentificationResultCollection containerResults = identifier.submit(identificationRequest);
droidCore.removeLowerPriorityHits(containerResults);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@
import org.junit.Test;
import uk.gov.nationalarchives.droid.core.SignatureParseException;
import uk.gov.nationalarchives.droid.core.interfaces.IdentificationMethod;

import uk.gov.nationalarchives.droid.internal.api.DroidAPITestUtils.ContainerType;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.Optional;
import java.util.ResourceBundle;
import java.util.stream.Collectors;

Expand All @@ -47,6 +49,7 @@
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.notNullValue;
import static uk.gov.nationalarchives.droid.internal.api.DroidAPITestUtils.*;

public class DroidAPITest {

Expand All @@ -62,6 +65,66 @@ public void should_create_non_null_instance_using_test_utility_class() {
assertThat(api, is(notNullValue()));
}

/**
 * Submits a generated GZIP file to an API created by
 * {@code DroidAPITestUtils.createApiForContainer} for a "GZIP" container type
 * (PUID x-fmt/266) and expects exactly one result with PUID fmt/12345 and the
 * CONTAINER identification method.
 *
 * @throws IOException if generating or submitting the file fails; declared rather
 *         than wrapped so the failure is reported directly, as in the other tests
 */
@Test
public void should_match_gzip_container_file() throws IOException {
    String data = "TEST";
    ContainerType containerType = new ContainerType("GZIP", generateId(), "x-fmt/266");
    // Named droidApi so it does not shadow the class-level api used by other tests.
    DroidAPI droidApi = DroidAPITestUtils.createApiForContainer(
            new DroidAPITestUtils.ContainerFile(containerType, data, "fmt/12345", Optional.empty()));

    List<ApiResult> results = droidApi.submit(DroidAPITestUtils.generateGzFile(data));

    assertThat(results, hasSize(1));
    assertThat(results.getFirst().getPuid(), is("fmt/12345"));
    assertThat(results.getFirst().getMethod(), is(IdentificationMethod.CONTAINER));
}

/**
 * Submits a generated ZIP file to an API created by
 * {@code DroidAPITestUtils.createApiForContainer} for a "ZIP" container type
 * (PUID x-fmt/263) and expects exactly one result with PUID fmt/12345 and the
 * CONTAINER identification method.
 *
 * @throws IOException if generating or submitting the file fails; declared rather
 *         than wrapped so the failure is reported directly, as in the other tests
 */
@Test
public void should_match_zip_container_file() throws IOException {
    String data = "TEST";
    ContainerType containerType = new ContainerType("ZIP", generateId(), "x-fmt/263");
    // Named droidApi so it does not shadow the class-level api used by other tests.
    DroidAPI droidApi = DroidAPITestUtils.createApiForContainer(
            new DroidAPITestUtils.ContainerFile(containerType, data, "fmt/12345", Optional.of(data)));

    List<ApiResult> results = droidApi.submit(DroidAPITestUtils.generateZipFile(data, data));

    assertThat(results, hasSize(1));
    assertThat(results.getFirst().getPuid(), is("fmt/12345"));
    assertThat(results.getFirst().getMethod(), is(IdentificationMethod.CONTAINER));
}

/**
 * Submits a generated OLE2 file to an API created by
 * {@code DroidAPITestUtils.createApiForContainer} for an "OLE2" container type
 * (PUID fmt/111) and expects exactly one result with PUID fmt/12345 and the
 * CONTAINER identification method.
 *
 * @throws IOException if generating or submitting the file fails; declared rather
 *         than wrapped so the failure is reported directly, as in the other tests
 */
@Test
public void should_match_ole2_container_file() throws IOException {
    String data = "TEST";
    ContainerType containerType = new ContainerType("OLE2", generateId(), "fmt/111");
    // Named droidApi so it does not shadow the class-level api used by other tests.
    DroidAPI droidApi = DroidAPITestUtils.createApiForContainer(
            new DroidAPITestUtils.ContainerFile(containerType, data, "fmt/12345", Optional.of(data)));

    List<ApiResult> results = droidApi.submit(DroidAPITestUtils.generateOle2File(data, data));

    assertThat(results, hasSize(1));
    assertThat(results.getFirst().getPuid(), is("fmt/12345"));
    assertThat(results.getFirst().getMethod(), is(IdentificationMethod.CONTAINER));
}

/**
 * Submitting the path {@code /invalidpath} is expected to raise an {@link IOException}.
 */
@Test(expected = IOException.class)
public void should_throw_an_exception_if_file_cannot_be_read() throws IOException {
    final Path unreadableFile = Path.of("/invalidpath");
    api.submit(unreadableFile);
}

/**
 * Creating a {@code DroidAPI} with the container signature path
 * {@code /invalidContainerPath} is expected to raise a {@link RuntimeException}.
 */
@Test(expected = RuntimeException.class)
public void should_throw_an_exception_if_container_file_cannot_be_read() throws SignatureParseException {
    final Path missingContainerSignature = Path.of("/invalidContainerPath");
    DroidAPI.getInstance(signaturePath, missingContainerSignature);
}

/**
 * Creating a {@code DroidAPI} with the binary signature path
 * {@code /invalidSignaturePath} is expected to raise a {@code SignatureParseException}.
 */
@Test(expected = SignatureParseException.class)
public void should_throw_an_exception_if_signature_file_cannot_be_read() throws SignatureParseException {
    final Path missingBinarySignature = Path.of("/invalidSignaturePath");
    DroidAPI.getInstance(missingBinarySignature, containerPath);
}

@Test
public void should_identify_given_file_with_binary_signature() throws IOException {
List<ApiResult> results = api.submit(
Expand All @@ -70,7 +133,7 @@ public void should_identify_given_file_with_binary_signature() throws IOExceptio

assertThat(results.size(), is(1));

ApiResult identificationResult = results.get(0);
ApiResult identificationResult = results.getFirst();

assertThat(identificationResult.getPuid(), is("x-fmt/263"));
assertThat(identificationResult.getName(), is("ZIP Format"));
Expand All @@ -85,7 +148,7 @@ public void should_identify_given_file_using_container_signature() throws IOExce

assertThat(results.size(), is(1));

ApiResult identificationResult = results.get(0);
ApiResult identificationResult = results.getFirst();

assertThat(identificationResult.getPuid(), is("fmt/291"));
assertThat(identificationResult.getName(), is("OpenDocument Text"));
Expand All @@ -98,7 +161,7 @@ public void should_identify_given_file_using_file_extension() throws IOException
assertThat(results, is(notNullValue()));
assertThat(results, hasSize(1));

ApiResult singleResult = results.get(0);
ApiResult singleResult = results.getFirst();

assertThat(singleResult.getPuid(), is("x-fmt/111"));
assertThat(singleResult.getMethod(), is(IdentificationMethod.EXTENSION));
Expand All @@ -109,8 +172,8 @@ public void should_report_extension_of_the_file_under_identification_test() thro
List<ApiResult> resultsWithExtension = api.submit(Paths.get("src/test/resources/test.txt"));
List<ApiResult> resultsWithoutExtension = api.submit(Paths.get("src/test/resources/word97"));

assertThat(resultsWithExtension.get(0).getExtension(), is("txt"));
assertThat(resultsWithoutExtension.get(0).getExtension(), is(""));
assertThat(resultsWithExtension.getFirst().getExtension(), is("txt"));
assertThat(resultsWithoutExtension.getFirst().getExtension(), is(""));
}

@Test
Expand All @@ -127,8 +190,8 @@ public void should_report_all_puids_when_there_are_more_than_one_identification_
public void should_report_when_there_is_an_extension_mismatch() throws IOException {
List<ApiResult> results = api.submit(Paths.get("src/test/resources/docx-file-as-xls.xlsx"));
assertThat(results.size(), is(1));
assertThat(results.get(0).getPuid(), is("fmt/412"));
assertThat(results.get(0).isFileExtensionMismatch(), is(true));
assertThat(results.getFirst().getPuid(), is("fmt/412"));
assertThat(results.getFirst().isFileExtensionMismatch(), is(true));
}

@Test
Expand Down Expand Up @@ -160,6 +223,6 @@ public void should_produce_results_for_every_time_a_file_is_submitted_for_identi
public void should_identify_fmt_40_correctly_with_container_identification_method() throws IOException {
List<ApiResult> results = api.submit(
Paths.get("../droid-container/src/test/resources/word97.doc"));
assertThat(results.get(0).getName(), is("Microsoft Word Document"));
assertThat(results.getFirst().getName(), is("Microsoft Word Document"));
}
}
Loading

0 comments on commit fba0b3a

Please sign in to comment.