Skip to content

Commit b66b171

Browse files
authored
Merge pull request #184 from kit-data-manager/fix-49-data-entities-without-hasPart
Avoid adding dataEntities which are not being added to the root data entity
2 parents 4dd1095 + c9de0c0 commit b66b171

File tree

10 files changed

+221
-69
lines changed

10 files changed

+221
-69
lines changed

src/main/java/edu/kit/datamanager/ro_crate/Crate.java

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public interface Crate {
2424
/**
2525
* Read version from the crate descriptor and return it as a class
2626
* representation.
27-
*
27+
* <p>
2828
* NOTE: If there is no version in the crate, it does not comply with the
2929
* specification.
3030
*
@@ -36,9 +36,9 @@ public interface Crate {
3636
/**
3737
* Returns strings indicating the conformance of a crate with other
3838
* specifications than the RO-Crate version.
39-
*
39+
* <p>
4040
* If you need the crate version too, refer to {@link #getVersion()}.
41-
*
41+
* <p>
4242
* This corresponds technically to all conformsTo values, excluding the RO crate
4343
* version / specification.
4444
*
@@ -71,7 +71,12 @@ public interface Crate {
7171

7272
AbstractEntity getEntityById(java.lang.String id);
7373

74-
void addDataEntity(DataEntity entity, Boolean toHasPart);
74+
/**
75+
* Adds a data entity to the crate.
76+
*
77+
* @param entity the DataEntity to add to this crate.
78+
*/
79+
void addDataEntity(DataEntity entity);
7580

7681
void addContextualEntity(ContextualEntity entity);
7782

src/main/java/edu/kit/datamanager/ro_crate/RoCrate.java

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030

3131
/**
3232
* The class that represents a single ROCrate.
33-
*
33+
* <p>
3434
* To build or modify it, use an instance of {@link RoCrateBuilder}. In the case
3535
* features of RO-Crate DRAFT specifications are needed, refer to
3636
* {@link BuilderWithDraftFeatures} and its documentation.
@@ -40,13 +40,13 @@
4040
*/
4141
public class RoCrate implements Crate {
4242

43-
private final CratePayload roCratePayload;
44-
private CrateMetadataContext metadataContext;
45-
private CratePreview roCratePreview;
46-
private RootDataEntity rootDataEntity;
47-
private ContextualEntity jsonDescriptor;
43+
protected final CratePayload roCratePayload;
44+
protected CrateMetadataContext metadataContext;
45+
protected CratePreview roCratePreview;
46+
protected RootDataEntity rootDataEntity;
47+
protected ContextualEntity jsonDescriptor;
4848

49-
private Collection<File> untrackedFiles;
49+
protected Collection<File> untrackedFiles;
5050

5151
@Override
5252
public CratePreview getPreview() {
@@ -181,13 +181,19 @@ public AbstractEntity getEntityById(String id) {
181181
return this.roCratePayload.getEntityById(id);
182182
}
183183

184+
/**
185+
* {@inheritDoc}
186+
* <p>
187+
* Note: This will also link the DataEntity to the root node
188+
* using the root node's hasPart property.
189+
*
190+
* @param entity the DataEntity to add to this crate.
191+
*/
184192
@Override
185-
public void addDataEntity(DataEntity entity, Boolean toHasPart) {
193+
public void addDataEntity(DataEntity entity) {
186194
this.metadataContext.checkEntity(entity);
187195
this.roCratePayload.addDataEntity(entity);
188-
if (Boolean.TRUE.equals(toHasPart)) {
189-
this.rootDataEntity.addToHasPart(entity.getId());
190-
}
196+
this.rootDataEntity.addToHasPart(entity.getId());
191197
}
192198

193199
@Override
@@ -327,10 +333,12 @@ public RoCrateBuilder addDescription(String description) {
327333
}
328334

329335
/**
330-
* Adding a data entity to the crate. The important part here is to also
331-
* add its id to the RootData Entity hasPart.
336+
* Adds a data entity to the crate.
337+
* <p>
338+
* Note: This will also link the DataEntity to the root node
339+
* using the root node's hasPart property.
332340
*
333-
* @param dataEntity the DataEntity object.
341+
* @param dataEntity the DataEntity to add to this crate.
334342
* @return returns the builder for further usage.
335343
*/
336344
public RoCrateBuilder addDataEntity(DataEntity dataEntity) {
@@ -414,7 +422,7 @@ public RoCrateBuilder addUntrackedFile(File file) {
414422
}
415423

416424
/**
417-
* Returns a crate with the information from this builder.
425+
* @return a crate with the information from this builder.
418426
*/
419427
public RoCrate build() {
420428
return new RoCrate(this);
@@ -424,9 +432,9 @@ public RoCrate build() {
424432
/**
425433
* Builder for Crates, supporting features which are not in a final
426434
* specification yet.
427-
*
435+
* <p>
428436
* NOTE: This will change the specification version of your crate.
429-
*
437+
* <p>
430438
* We only add features we expect to be in the new specification in the end.
431439
* In case a feature will not make it into the specification, we will mark
432440
* it as deprecated and remove it in new major versions. If a feature is
@@ -467,7 +475,7 @@ public BuilderWithDraftFeatures(RoCrate crate) {
467475
/**
468476
* Indicate this crate also conforms to the given specification, in
469477
* addition to the version this builder adds.
470-
*
478+
* <p>
471479
* This is helpful for profiles or other specifications the crate
472480
* conforms to. Can be called multiple times to add more specifications.
473481
*

src/main/java/edu/kit/datamanager/ro_crate/externalproviders/dataentities/ImportFromZenodo.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ private static void addToCrateFromZotero(String url, Crate crate) {
113113
if (entity.get("@id").asText().equals(mainId)) {
114114
var dataEntity = new DataEntity.DataEntityBuilder()
115115
.setAll((ObjectNode) entity).build();
116-
crate.addDataEntity(dataEntity, true);
116+
crate.addDataEntity(dataEntity);
117117
} else {
118118
// here we have to think of a way to differentiate between data and contextual entities.
119119
var contextualEntity = new ContextualEntity.ContextualEntityBuilder()

src/main/java/edu/kit/datamanager/ro_crate/reader/RoCrateReader.java

Lines changed: 115 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -10,29 +10,40 @@
1010
import edu.kit.datamanager.ro_crate.entities.contextual.ContextualEntity;
1111
import edu.kit.datamanager.ro_crate.entities.data.DataEntity;
1212
import edu.kit.datamanager.ro_crate.entities.data.RootDataEntity;
13+
import edu.kit.datamanager.ro_crate.special.IdentifierUtils;
1314
import edu.kit.datamanager.ro_crate.special.JsonUtilFunctions;
14-
import static edu.kit.datamanager.ro_crate.special.IdentifierUtils.decode;
15-
import static edu.kit.datamanager.ro_crate.special.IdentifierUtils.isUrl;
1615

1716
import edu.kit.datamanager.ro_crate.validation.JsonSchemaValidation;
1817
import edu.kit.datamanager.ro_crate.validation.Validator;
19-
import org.apache.commons.io.FileUtils;
2018

2119
import java.io.File;
22-
import java.nio.file.Path;
2320
import java.util.*;
2421
import java.util.stream.Collectors;
22+
import java.util.stream.Stream;
2523
import java.util.stream.StreamSupport;
2624

2725
/**
2826
* This class allows reading crates from the outside into the library in order
2927
* to inspect or modify them.
30-
*
28+
* <p>
3129
* The class takes a strategy to support different ways of importing the crates.
3230
* (from zip, folder, etc.)
3331
*/
3432
public class RoCrateReader {
3533

34+
/**
35+
* This is a private inner class that shall not be exposed.
36+
* <b>Do not make it public or protected.</b> It serves only the
37+
* purpose of unsafe operations while reading a crate and
38+
* may be specific to this implementation.
39+
*/
40+
private static class RoCrateUnsafe extends RoCrate {
41+
public void addDataEntityWithoutRootHasPart(DataEntity entity) {
42+
this.metadataContext.checkEntity(entity);
43+
this.roCratePayload.addDataEntity(entity);
44+
}
45+
}
46+
3647
/**
3748
* If the number of JSON entities in the crate is larger than this number,
3849
* parallelization will be used.
@@ -80,34 +91,37 @@ public RoCrate readCrate(String location) {
8091
JsonNode context = metadataJson.get(PROP_CONTEXT);
8192

8293
CrateMetadataContext crateContext = new RoCrateMetadataContext(context);
83-
RoCrate crate = new RoCrate();
94+
RoCrateUnsafe crate = new RoCrateUnsafe();
8495
crate.setMetadataContext(crateContext);
8596
JsonNode graph = metadataJson.get(PROP_GRAPH);
8697

8798
if (graph.isArray()) {
88-
8999
moveRootEntitiesFromGraphToCrate(crate, (ArrayNode) graph);
90-
for (JsonNode node : graph) {
91-
// if the id is in the root hasPart list, we know this entity is a data entity
92-
RootDataEntity root = crate.getRootDataEntity();
93-
if (root != null && root.hasInHasPart(node.get(PROP_ID).asText())) {
94-
// data entity
95-
DataEntity.DataEntityBuilder dataEntity = new DataEntity.DataEntityBuilder()
96-
.setAll(node.deepCopy());
97-
98-
// Handle data entities with corresponding file
99-
File loc = checkFolderHasFile(node.get(PROP_ID).asText(), files);
100-
if (loc != null) {
101-
usedFiles.add(loc.getPath());
102-
dataEntity.setLocationWithExceptions(loc.toPath())
103-
.setId(loc.getName());
104-
}
100+
RootDataEntity root = crate.getRootDataEntity();
101+
if (root != null) {
102+
Set<String> dataEntityIds = getDataEntityIds(root, graph);
103+
for (JsonNode entityJson : graph) {
104+
String eId = unpackId(entityJson);
105+
if (dataEntityIds.contains(eId)) {
106+
// data entity
107+
DataEntity.DataEntityBuilder dataEntity = new DataEntity.DataEntityBuilder()
108+
.setAll(entityJson.deepCopy());
109+
110+
// Handle data entities with corresponding file
111+
checkFolderHasFile(entityJson.get(PROP_ID).asText(), files).ifPresent(file -> {
112+
usedFiles.add(file.getPath());
113+
dataEntity.setLocationWithExceptions(file.toPath())
114+
.setId(file.getName());
115+
});
105116

106-
crate.addDataEntity(dataEntity.build(), false);
107-
} else {
108-
// contextual entity
109-
crate.addContextualEntity(
110-
new ContextualEntity.ContextualEntityBuilder().setAll(node.deepCopy()).build());
117+
crate.addDataEntityWithoutRootHasPart(dataEntity.build());
118+
} else {
119+
// contextual entity
120+
crate.addContextualEntity(
121+
new ContextualEntity.ContextualEntityBuilder()
122+
.setAll(entityJson.deepCopy())
123+
.build());
124+
}
111125
}
112126
}
113127
}
@@ -123,18 +137,79 @@ public RoCrate readCrate(String location) {
123137
return crate;
124138
}
125139

126-
protected File checkFolderHasFile(String id, File file) {
127-
if (isUrl(id)) return null;
128-
Path path = file.toPath().resolve(decode(id).get());
129-
if (path.toFile().exists()) {
130-
return path.toFile();
140+
/**
141+
* Extracts graph connections from top to bottom.
142+
* <p>
143+
* Example: {@code connections.get(parent)} yields the ids of the children of {@code parent}.
144+
*
145+
* @param graph the ArrayNode with all Entities.
146+
* @return the graph connections.
147+
*/
148+
protected Map<String, Set<String>> makeEntityGraph(JsonNode graph) {
149+
Map<String, Set<String>> connections = new HashMap<>();
150+
151+
Map<String, JsonNode> idToNodes = new HashMap<>();
152+
StreamSupport.stream(graph.spliterator(), false)
153+
.forEach(jsonNode -> idToNodes.put(unpackId(jsonNode), jsonNode));
154+
155+
for (JsonNode entityNode : graph) {
156+
String currentId = unpackId(entityNode);
157+
StreamSupport.stream(entityNode.path("hasPart").spliterator(), false)
158+
.map(this::unpackId)
159+
.map(s -> idToNodes.getOrDefault(s, null))
160+
.filter(Objects::nonNull)
161+
.forEach(child -> connections.computeIfAbsent(currentId, key -> new HashSet<>())
162+
.add(unpackId(child)));
163+
StreamSupport.stream(entityNode.path("isPartOf").spliterator(), false)
164+
.map(this::unpackId)
165+
.map(s -> idToNodes.getOrDefault(s, null))
166+
.filter(Objects::nonNull)
167+
.forEach(parent -> connections.computeIfAbsent(unpackId(parent), key -> new HashSet<>())
168+
.add(currentId));
131169
}
132-
return null;
170+
return connections;
171+
}
172+
173+
protected Set<String> getDataEntityIds(RootDataEntity root, JsonNode graph) {
174+
if (root == null) { return Set.of(); }
175+
Map<String, Set<String>> network = makeEntityGraph(graph);
176+
Set<String> directDataEntities = new HashSet<>(root.hasPart);
177+
return Stream.concat(
178+
directDataEntities.stream(),
179+
directDataEntities.stream().flatMap(entity -> getDataEntityIdsRecursive(entity, network))
180+
).collect(Collectors.toSet());
181+
}
182+
183+
protected Stream<String> getDataEntityIdsRecursive(
184+
String parent,
185+
Map<String, Set<String>> network
186+
) {
187+
return Stream.concat(
188+
Stream.of(parent),
189+
network.getOrDefault(parent, new HashSet<>()).stream()
190+
.flatMap(s -> getDataEntityIdsRecursive(s, network))
191+
.filter(Objects::nonNull)
192+
);
193+
}
194+
195+
protected String unpackId(JsonNode node) {
196+
if (node.isTextual()) {
197+
return node.asText();
198+
} else /*if (node.isObject())*/ {
199+
return node.path(PROP_ID).asText();
200+
}
201+
}
202+
203+
protected Optional<File> checkFolderHasFile(String filepathOrId, File folder) {
204+
if (IdentifierUtils.isUrl(filepathOrId)) { return Optional.empty(); }
205+
return IdentifierUtils.decode(filepathOrId)
206+
.map(decoded -> folder.toPath().resolve(decoded).toFile())
207+
.filter(File::exists);
133208
}
134209

135210
/**
136211
* Moves the descriptor and the root entity from the graph to the crate.
137-
*
212+
* <p>
138213
* Extracts the root data entity and the Metadata File Descriptor from the graph
139214
* and inserts them into the crate object. It also deletes it from the graph.
140215
* We will need the root dataset to distinguish between data entities and
@@ -170,7 +245,7 @@ protected void moveRootEntitiesFromGraphToCrate(RoCrate crate, ArrayNode graph)
170245

171246
/**
172247
* Find the metadata descriptor.
173-
*
248+
* <p>
174249
* Currently prefers the algorithm of version 1.1 over the one of 1.2-DRAFT.
175250
*
176251
* @param graph the graph to search the descriptor in.
@@ -200,10 +275,12 @@ protected Optional<JsonNode> getMetadataDescriptor(ArrayNode graph) {
200275
/**
201276
* Extracts the root entity from the graph, using the information from the
202277
* descriptor.
203-
*
278+
* <p>
204279
* Basically implements step 5 of the algorithm described here:
205-
* https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
206-
*
280+
* <a href="https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity">
281+
* https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
282+
* </a>
283+
*
207284
* @param graph the graph from the metadata JSON-LD file
208285
* @param descriptor the RO-Crate descriptor
209286
* @return the root entity, if found

src/main/java/edu/kit/datamanager/ro_crate/special/IdentifierUtils.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
/**
1717
* This class defines methods regarding URIs in general, which in RO-Crate
1818
* context usually means a valid, resolvable URL or a relative file path.
19-
*
19+
* <p>
2020
* The purpose is to have a simple abstraction where the way e.g. a URL is
2121
* checked can be changed and tested easily for the whole library.
2222
*/
@@ -28,7 +28,7 @@ private IdentifierUtils() {}
2828
/**
2929
* Returns true, if the given String is encoded and can be used as an
3030
* identifier in RO-Crate.
31-
*
31+
* <p>
3232
* Possible identifiers include:
3333
* - a uri
3434
* - a url
@@ -50,7 +50,9 @@ public static boolean isValidUri(String uri) {
5050
*/
5151
public static boolean isUrl(String uri) {
5252
try {
53-
return asUrl(encode(uri).get()).isPresent();
53+
return encode(uri)
54+
.map(decodedUri -> asUrl(decodedUri).isPresent())
55+
.orElse(false);
5456
} catch (Exception e) {
5557
return false;
5658
}

0 commit comments

Comments
 (0)