Skip to content

Commit 5f59145

Browse files
authored
Merge pull request #247 from kit-data-manager/generalize-reading-and-writing-crates
Generalize reading and writing crates
2 parents 0ab3f3c + 3cfda3a commit 5f59145

39 files changed

Lines changed: 1548 additions & 1880 deletions
Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
package edu.kit.datamanager.ro_crate.reader;
2+
3+
import com.fasterxml.jackson.databind.JsonNode;
4+
import com.fasterxml.jackson.databind.node.ArrayNode;
5+
import com.fasterxml.jackson.databind.node.ObjectNode;
6+
import edu.kit.datamanager.ro_crate.RoCrate;
7+
import edu.kit.datamanager.ro_crate.context.CrateMetadataContext;
8+
import edu.kit.datamanager.ro_crate.context.RoCrateMetadataContext;
9+
import edu.kit.datamanager.ro_crate.entities.contextual.ContextualEntity;
10+
import edu.kit.datamanager.ro_crate.entities.data.DataEntity;
11+
import edu.kit.datamanager.ro_crate.entities.data.RootDataEntity;
12+
import edu.kit.datamanager.ro_crate.special.IdentifierUtils;
13+
import edu.kit.datamanager.ro_crate.special.JsonUtilFunctions;
14+
import edu.kit.datamanager.ro_crate.validation.JsonSchemaValidation;
15+
import edu.kit.datamanager.ro_crate.validation.Validator;
16+
import org.slf4j.Logger;
17+
import org.slf4j.LoggerFactory;
18+
19+
import java.io.File;
20+
import java.util.*;
21+
import java.util.stream.Collectors;
22+
import java.util.stream.StreamSupport;
23+
24+
/**
25+
* This class allows reading crates from the outside into the library in order
26+
* to inspect or modify them.
27+
* <p>
28+
* The constructor takes a strategy to support different ways of importing the
29+
* crates (from zip, folder, etc.).
30+
* <p>
31+
* The reader considers "hasPart" and "isPartOf" properties and treats all
32+
* entities (in-)directly connected to the root entity ("./") as DataEntities.
33+
*
34+
* @param <T> the type of the location parameter
35+
*/
36+
public class CrateReader<T> {
37+
38+
// SLF4J logger used to report problems while rebuilding a crate.
private static final Logger logger = LoggerFactory.getLogger(CrateReader.class);
39+
40+
/**
41+
* This is a private inner class that shall not be exposed. **Do not make it
42+
* public or protected.** It serves only the purpose of unsafe operations
43+
* while reading a crate and may be specific to this implementation.
44+
*/
45+
private static class RoCrateUnsafe extends RoCrate {
46+
47+
/**
 * Adds a data entity to the crate by checking it against the metadata
 * context and then storing it in the crate payload.
 * <p>
 * NOTE(review): judging by the name, this presumably differs from the
 * regular way of adding data entities in that the root entity's "hasPart"
 * is left untouched - confirm against {@code RoCrate}.
 *
 * @param entity the data entity to add
 */
public void addDataEntityWithoutRootHasPart(DataEntity entity) {
48+
// the context check runs before the entity is stored in the payload
this.metadataContext.checkEntity(entity);
49+
this.roCratePayload.addDataEntity(entity);
50+
}
51+
}
52+
53+
/**
54+
* If the number of JSON entities in the crate is larger than this number,
55+
* parallelization will be used.
56+
*/
57+
private static final int PARALLELIZATION_THRESHOLD = 100;
58+
59+
// Names of well-known entries directly inside a crate folder. readCrate marks
// them as "used" so that they are never reported as untracked files.
private static final String FILE_PREVIEW_FILES = "ro-crate-preview_files";
60+
private static final String FILE_PREVIEW_HTML = "ro-crate-preview.html";
61+
// Also used by getMetadataDescriptor as the @id of the descriptor (fallback lookup).
private static final String FILE_METADATA_JSON = "ro-crate-metadata.json";
62+
63+
// Prefix the "conformsTo" @id of a metadata descriptor has to start with.
protected static final String SPECIFICATION_PREFIX = "https://w3id.org/ro/crate/";
64+
65+
// Names of the JSON properties this reader looks up in the metadata document.
protected static final String PROP_ABOUT = "about";
66+
protected static final String PROP_CONTEXT = "@context";
67+
protected static final String PROP_CONFORMS_TO = "conformsTo";
68+
protected static final String PROP_GRAPH = "@graph";
69+
protected static final String PROP_HAS_PART = "hasPart";
70+
protected static final String PROP_ID = "@id";
71+
72+
// Supplies the metadata JSON and the content folder for a given location.
private final GenericReaderStrategy<T> strategy;
73+
74+
/**
 * Creates a reader which uses the given strategy to access crates.
 *
 * @param strategy the strategy providing the metadata JSON and the content
 *                 of a crate for a location of type {@code T}
 * @throws NullPointerException if {@code strategy} is {@code null}
 */
public CrateReader(GenericReaderStrategy<T> strategy) {
    // fail fast here instead of with an anonymous NPE on the first readCrate call
    this.strategy = Objects.requireNonNull(strategy, "strategy must not be null");
}
77+
78+
/**
79+
* This function will read the location (using one of the specified
80+
* strategies) and then build the relation between the entities.
81+
*
82+
* @param location the location of the ro-crate to be read
83+
* @return the read RO-crate
84+
*/
85+
public RoCrate readCrate(T location) {
86+
// get the ro-crate-metadata.json
87+
ObjectNode metadataJson = strategy.readMetadataJson(location);
88+
// get the content of the crate
89+
File files = strategy.readContent(location);
90+
91+
// this set will contain the files that are associated with entities
92+
HashSet<String> usedFiles = new HashSet<>();
93+
usedFiles.add(files.toPath().resolve(FILE_METADATA_JSON).toFile().getPath());
94+
usedFiles.add(files.toPath().resolve(FILE_PREVIEW_HTML).toFile().getPath());
95+
usedFiles.add(files.toPath().resolve(FILE_PREVIEW_FILES).toFile().getPath());
96+
return rebuildCrate(metadataJson, files, usedFiles);
97+
}
98+
99+
private RoCrate rebuildCrate(ObjectNode metadataJson, File files, HashSet<String> usedFiles) {
100+
if (metadataJson == null) {
101+
logger.error("Metadata JSON is null, cannot rebuild crate");
102+
return null;
103+
}
104+
if (files == null) {
105+
logger.error("Content files directory is null, cannot rebuild crate");
106+
return null;
107+
}
108+
JsonNode context = metadataJson.get(PROP_CONTEXT);
109+
110+
CrateMetadataContext crateContext = new RoCrateMetadataContext(context);
111+
RoCrateUnsafe crate = new RoCrateUnsafe();
112+
crate.setMetadataContext(crateContext);
113+
JsonNode graph = metadataJson.get(PROP_GRAPH);
114+
115+
if (graph.isArray()) {
116+
moveRootEntitiesFromGraphToCrate(crate, (ArrayNode) graph);
117+
RootDataEntity root = crate.getRootDataEntity();
118+
if (root != null) {
119+
Set<String> dataEntityIds = getDataEntityIds(root, graph);
120+
for (JsonNode entityJson : graph) {
121+
String eId = unpackId(entityJson);
122+
if (dataEntityIds.contains(eId)) {
123+
// data entity
124+
DataEntity.DataEntityBuilder dataEntity = new DataEntity.DataEntityBuilder()
125+
.setAll(entityJson.deepCopy());
126+
127+
// Handle data entities with corresponding file
128+
checkFolderHasFile(entityJson.get(PROP_ID).asText(), files).ifPresent(file -> {
129+
usedFiles.add(file.getPath());
130+
dataEntity.setLocationWithExceptions(file.toPath())
131+
.setId(file.getName());
132+
});
133+
134+
crate.addDataEntityWithoutRootHasPart(dataEntity.build());
135+
} else {
136+
// contextual entity
137+
crate.addContextualEntity(
138+
new ContextualEntity.ContextualEntityBuilder()
139+
.setAll(entityJson.deepCopy())
140+
.build());
141+
}
142+
}
143+
}
144+
}
145+
146+
Collection<File> untrackedFiles = Arrays.stream(
147+
Optional.ofNullable(files.listFiles()).orElse(new File[0]))
148+
.filter(f -> !usedFiles.contains(f.getPath()))
149+
.collect(Collectors.toSet());
150+
151+
crate.setUntrackedFiles(untrackedFiles);
152+
Validator defaultValidation = new Validator(new JsonSchemaValidation());
153+
defaultValidation.validate(crate);
154+
return crate;
155+
}
156+
157+
/**
158+
* Extracts graph connections from top to bottom.
159+
* <p>
160+
* Example: (connections.get(parent) -> children)
161+
*
162+
* @param graph the ArrayNode with all Entities.
163+
* @return the graph connections.
164+
*/
165+
protected Map<String, Set<String>> makeEntityGraph(JsonNode graph) {
166+
Map<String, Set<String>> connections = new HashMap<>();
167+
168+
Map<String, JsonNode> idToNodes = new HashMap<>();
169+
StreamSupport.stream(graph.spliterator(), false)
170+
.forEach(jsonNode -> idToNodes.put(unpackId(jsonNode), jsonNode));
171+
172+
for (JsonNode entityNode : graph) {
173+
String currentId = unpackId(entityNode);
174+
StreamSupport.stream(entityNode.path("hasPart").spliterator(), false)
175+
.map(this::unpackId)
176+
.map(s -> idToNodes.getOrDefault(s, null))
177+
.filter(Objects::nonNull)
178+
.forEach(child -> connections.computeIfAbsent(currentId, key -> new HashSet<>())
179+
.add(unpackId(child)));
180+
StreamSupport.stream(entityNode.path("isPartOf").spliterator(), false)
181+
.map(this::unpackId)
182+
.map(s -> idToNodes.getOrDefault(s, null))
183+
.filter(Objects::nonNull)
184+
.forEach(parent -> connections.computeIfAbsent(unpackId(parent), key -> new HashSet<>())
185+
.add(currentId));
186+
}
187+
return connections;
188+
}
189+
190+
protected Set<String> getDataEntityIds(RootDataEntity root, JsonNode graph) {
191+
if (root == null) {
192+
return Set.of();
193+
}
194+
Map<String, Set<String>> network = makeEntityGraph(graph);
195+
Set<String> directDataEntities = new HashSet<>(root.hasPart);
196+
197+
Stack<String> processingQueue = new Stack<>();
198+
processingQueue.addAll(directDataEntities);
199+
Set<String> result = new HashSet<>();
200+
201+
while (!processingQueue.empty()) {
202+
String currentId = processingQueue.pop();
203+
result.add(currentId);
204+
network.getOrDefault(currentId, new HashSet<>()).stream()
205+
.filter(subId -> !result.contains(subId)) // avoid loops!
206+
.forEach(subId -> {
207+
result.add(subId);
208+
processingQueue.add(subId);
209+
});
210+
}
211+
return result;
212+
}
213+
214+
protected String unpackId(JsonNode node) {
215+
if (node.isTextual()) {
216+
return node.asText();
217+
} else /*if (node.isObject())*/ {
218+
return node.path(PROP_ID).asText();
219+
}
220+
}
221+
222+
protected Optional<File> checkFolderHasFile(String filepathOrId, File folder) {
223+
if (IdentifierUtils.isUrl(filepathOrId)) {
224+
return Optional.empty();
225+
}
226+
return IdentifierUtils.decode(filepathOrId)
227+
.map(decoded -> folder.toPath().resolve(decoded).toFile())
228+
.filter(File::exists);
229+
}
230+
231+
/**
232+
* Moves the descriptor and the root entity from the graph to the crate.
233+
* <p>
234+
* Extracts the root data entity and the Metadata File Descriptor from the
235+
* graph and inserts them into the crate object. It also deletes it from the
236+
* graph. We will need the root dataset to distinguish between data entities
237+
* and contextual entities.
238+
*
239+
* @param crate the crate, which will receive the entities, if available in
240+
* the graph.
241+
* @param graph the graph of the Metadata JSON file, where the entities are
242+
* extracted and removed from.
243+
*/
244+
protected void moveRootEntitiesFromGraphToCrate(RoCrate crate, ArrayNode graph) {
245+
Optional<JsonNode> maybeDescriptor = getMetadataDescriptor(graph);
246+
247+
maybeDescriptor.ifPresent(descriptor -> {
248+
setCrateDescriptor(crate, descriptor);
249+
JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, descriptor);
250+
251+
Optional<ObjectNode> maybeRoot = extractRoot(graph, descriptor);
252+
253+
maybeRoot.ifPresent(root -> {
254+
Set<String> hasPartIds = extractHasPartIds(root);
255+
256+
crate.setRootDataEntity(
257+
new RootDataEntity.RootDataEntityBuilder()
258+
.setAll(root.deepCopy())
259+
.setHasPart(hasPartIds)
260+
.build());
261+
262+
JsonUtilFunctions.removeJsonNodeFromArrayNode(graph, root);
263+
});
264+
});
265+
}
266+
267+
/**
268+
* Find the metadata descriptor.
269+
* <p>
270+
* Currently prefers algorithm of version 1.1 over the one of 1.2-DRAFT.
271+
*
272+
* @param graph the graph to search the descriptor in.
273+
* @return the metadata descriptor of the crate.
274+
*/
275+
protected Optional<JsonNode> getMetadataDescriptor(ArrayNode graph) {
276+
boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
277+
// use the algorithm described here:
278+
// https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
279+
Optional<JsonNode> maybeDescriptor = StreamSupport.stream(graph.spliterator(), isParallel)
280+
// "2. if the conformsTo property is a URI that starts with
281+
// https://w3id.org/ro/crate/"
282+
.filter(node -> node.path(PROP_CONFORMS_TO).path(PROP_ID).asText().startsWith(SPECIFICATION_PREFIX))
283+
// "3. from this entity’s about object keep the @id URI as variable root"
284+
.filter(node -> node.path(PROP_ABOUT).path(PROP_ID).isTextual())
285+
// There should be only one descriptor. If multiple exist, we take the first
286+
// one.
287+
.findFirst();
288+
return maybeDescriptor.or(()
289+
-> // from https://www.researchobject.org/ro-crate/1.2-DRAFT/root-data-entity.html#finding-the-root-data-entity
290+
StreamSupport.stream(graph.spliterator(), isParallel)
291+
.filter(node -> node.path(PROP_ID).asText().equals(FILE_METADATA_JSON))
292+
.findFirst()
293+
);
294+
}
295+
296+
/**
297+
* Extracts the root entity from the graph, using the information from the
298+
* descriptor.
299+
* <p>
300+
* Basically implements step 5 of the algorithm described here:
301+
* <a href="https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity">
302+
* https://www.researchobject.org/ro-crate/1.1/root-data-entity.html#finding-the-root-data-entity
303+
* </a>
304+
*
305+
* @param graph the graph from the metadata JSON-LD file
306+
* @param descriptor the RO-Crate descriptor
307+
* @return the root entity, if found
308+
*/
309+
private Optional<ObjectNode> extractRoot(ArrayNode graph, JsonNode descriptor) {
310+
String rootId = descriptor.get(PROP_ABOUT).get(PROP_ID).asText();
311+
boolean isParallel = graph.size() > PARALLELIZATION_THRESHOLD;
312+
return StreamSupport.stream(graph.spliterator(), isParallel)
313+
// root is an object (filter + conversion)
314+
.filter(JsonNode::isObject)
315+
.map(JsonNode::<ObjectNode>deepCopy)
316+
// "5. if the entity has an @id URI that matches root return it"
317+
.filter(node -> node.path(PROP_ID).asText().equals(rootId))
318+
.findFirst();
319+
}
320+
321+
private Set<String> extractHasPartIds(ObjectNode root) {
322+
JsonNode hasPartNode = root.path(PROP_HAS_PART);
323+
boolean isParallel = hasPartNode.isArray() && hasPartNode.size() > PARALLELIZATION_THRESHOLD;
324+
Set<String> hasPartIds = StreamSupport.stream(hasPartNode.spliterator(), isParallel)
325+
.map(hasPart -> hasPart.path(PROP_ID).asText())
326+
.filter(text -> !text.isBlank())
327+
.collect(Collectors.toSet());
328+
if (hasPartIds.isEmpty() && hasPartNode.path(PROP_ID).isTextual()) {
329+
hasPartIds.add(hasPartNode.path(PROP_ID).asText());
330+
}
331+
return hasPartIds;
332+
}
333+
334+
private void setCrateDescriptor(RoCrate crate, JsonNode descriptor) {
335+
ContextualEntity descriptorEntity = new ContextualEntity.ContextualEntityBuilder()
336+
.setAll(descriptor.deepCopy())
337+
.build();
338+
crate.setJsonDescriptor(descriptorEntity);
339+
}
340+
}
Lines changed: 4 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,12 @@
11
package edu.kit.datamanager.ro_crate.reader;
22

3-
import com.fasterxml.jackson.databind.ObjectMapper;
4-
import com.fasterxml.jackson.databind.node.ObjectNode;
5-
6-
import edu.kit.datamanager.ro_crate.objectmapper.MyObjectMapper;
7-
8-
import java.io.File;
9-
import java.io.IOException;
10-
import java.nio.file.Path;
11-
123
/**
134
* A class for reading a crate from a folder.
145
*
156
* @author Nikola Tzotchev on 9.2.2022 г.
167
* @version 1
8+
*
9+
* @deprecated Use {@link FolderStrategy} instead.
1710
*/
18-
public class FolderReader implements ReaderStrategy {
19-
20-
@Override
21-
public ObjectNode readMetadataJson(String location) {
22-
Path metadata = new File(location).toPath().resolve("ro-crate-metadata.json");
23-
ObjectMapper objectMapper = MyObjectMapper.getMapper();
24-
ObjectNode objectNode = objectMapper.createObjectNode();
25-
try {
26-
objectNode = objectMapper.readTree(metadata.toFile()).deepCopy();
27-
} catch (IOException e) {
28-
e.printStackTrace();
29-
}
30-
return objectNode;
31-
}
32-
33-
@Override
34-
public File readContent(String location) {
35-
return new File(location);
36-
}
37-
}
11+
@Deprecated(since = "2.1.0", forRemoval = true)
12+
// Intentionally empty: the class only remains as a deprecated name and
// inherits everything from FolderStrategy.
public class FolderReader extends FolderStrategy {}

0 commit comments

Comments
 (0)