From b95ff73a3769edb1717af1e16403a0891f97fa6b Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Sat, 17 Feb 2024 15:29:16 +0100 Subject: [PATCH] Add schema for an "object stream" dataset description This is conceptually already quite close to what we need to get at -- for serializing datasets, starting from their Git repository representation. This is only a minimal sketch at this point, but it demos how the recently found duality of semantic-defining and structure-defining classes would look in this context. There are a bunch of open questions that need more consideration. Something like: - Should we make the SE-level classes define `exact_mappings` to their ontology counterparts, or use discovery as shown in https://github.com/psychoinformatics-de/datalad-concepts/issues/37 - what kind of value do we use for `meta_type`? CURIEs? or plain class names? - ... --- Makefile | 9 +- .../ContainerSE-DataladDataset-minimal.json | 10 ++ .../ContainerSE-DataladDataset-minimal.rdf | 13 +++ .../ContainerSE-DataladDataset-minimal.yaml | 4 + ...ainerSE-DataladDatasetVersion-linkage.json | 15 +++ ...tainerSE-DataladDatasetVersion-linkage.rdf | 19 ++++ ...ainerSE-DataladDatasetVersion-linkage.yaml | 7 ++ .../DataladDatasetSE-minimal.json | 6 ++ .../DataladDatasetSE-minimal.rdf | 8 ++ .../DataladDatasetSE-minimal.yaml | 3 + .../about.md | 0 src/extra-docs/index.md | 1 + src/linkml/ontology/datalad.yaml | 2 + .../schemas/datalad-dataset-components.yaml | 98 +++++++++++++++++++ src/linkml/schemas/ontology.yaml | 2 +- .../Container-DataladDatasetSE-minimal.yaml | 5 + .../validation/ContainerSE.valid.cfg.yaml | 10 ++ 17 files changed, 209 insertions(+), 3 deletions(-) create mode 100644 src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.json create mode 100644 src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.rdf create mode 100644 src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.yaml create mode 100644 
src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.json create mode 100644 src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.rdf create mode 100644 src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.yaml create mode 100644 src/examples/datalad-dataset-components/DataladDatasetSE-minimal.json create mode 100644 src/examples/datalad-dataset-components/DataladDatasetSE-minimal.rdf create mode 100644 src/examples/datalad-dataset-components/DataladDatasetSE-minimal.yaml create mode 100644 src/extra-docs/datalad-dataset-components-schema/about.md create mode 100644 src/linkml/schemas/datalad-dataset-components.yaml create mode 100644 tests/datalad-dataset-components-schema/validation/Container-DataladDatasetSE-minimal.yaml create mode 100644 tests/datalad-dataset-components-schema/validation/ContainerSE.valid.cfg.yaml diff --git a/Makefile b/Makefile index 37e073e..476f010 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ build/context.jsonld: src/linkml/schemas/ontology.yaml build/linkml-docs: \ build/linkml-docs/ontology \ build/linkml-docs/data-access \ + build/linkml-docs/datalad-dataset-components \ build/linkml-docs/datalad-dataset-version # build/linkml-docs/git-provenance-schema build/linkml-docs/%: src/linkml/schemas/%.yaml src/extra-docs/%-schema @@ -33,11 +34,11 @@ build/linkml-docs/%: src/linkml/schemas/%.yaml src/extra-docs/%-schema --diagram-type er_diagram \ --metadata \ --format markdown \ - --example-directory src/examples/$$(basename $@)-schema \ + --example-directory src/examples/$* \ -d $$([ "$*" = "ontology" ] && echo $@ || echo $@-schema) \ $< # try to inject any extra-docs (if any exist) - -cp -r src/extra-docs/$$(basename $@)-schema/*.md $@ + -cp -r src/extra-docs/$*-schema/*.md $@ build/mkdocs-site: build/linkml-docs src/extra-docs/*.md # top-level content @@ -49,6 +50,7 @@ check: check-models check-validation # add additional schemas to lint here 
check-models: \ check-model-data-access \ + check-model-datalad-dataset-components \ check-model-datalad-dataset-version \ check-model-ontology # check-model-git-provenance @@ -80,6 +82,8 @@ check-model-%: src/linkml/schemas/%.yaml check-validation: \ convert-examples-data-access \ check-validation-data-access \ + convert-examples-datalad-dataset-components \ + check-validation-datalad-dataset-components \ convert-examples-datalad-dataset-version \ check-validation-datalad-dataset-version \ convert-examples-ontology @@ -100,6 +104,7 @@ check-invalid-validation-%: tests/%-schema/validation src/linkml/schemas/%.yaml convert-examples: \ convert-examples-data-access \ + convert-examples-datalad-dataset-components \ convert-examples-datalad-dataset-version \ convert-examples-ontology # convert-examples-git-provenance diff --git a/src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.json b/src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.json new file mode 100644 index 0000000..732a13c --- /dev/null +++ b/src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.json @@ -0,0 +1,10 @@ +{ + "components": [ + { + "meta_id": "datalad:0b76362c-aa27-11ee-be29-b3b123281259", + "meta_type": "dlccs:DataladDatasetSE", + "uuid": "0b76362c-aa27-11ee-be29-b3b123281259" + } + ], + "@type": "ContainerSE" +} diff --git a/src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.rdf b/src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.rdf new file mode 100644 index 0000000..68a1ed7 --- /dev/null +++ b/src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.rdf @@ -0,0 +1,13 @@ +@prefix datalad: . +@prefix dlccs: . +@prefix dlco: . +@prefix ns1: . +@prefix xsd: . + +datalad:0b76362c-aa27-11ee-be29-b3b123281259 dlco:meta_type "dlccs:DataladDatasetSE"^^xsd:anyURI ; + dlco:uuid "0b76362c-aa27-11ee-be29-b3b123281259"^^ . 
+ +[] a dlccs:Container ; + ns1:components datalad:0b76362c-aa27-11ee-be29-b3b123281259 . + + diff --git a/src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.yaml b/src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.yaml new file mode 100644 index 0000000..4a31305 --- /dev/null +++ b/src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.yaml @@ -0,0 +1,4 @@ +components: + - meta_id: datalad:0b76362c-aa27-11ee-be29-b3b123281259 + meta_type: dlccs:DataladDatasetSE + uuid: 0b76362c-aa27-11ee-be29-b3b123281259 diff --git a/src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.json b/src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.json new file mode 100644 index 0000000..6a3c8a1 --- /dev/null +++ b/src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.json @@ -0,0 +1,15 @@ +{ + "components": [ + { + "meta_id": "datalad:0b76362c-aa27-11ee-be29-b3b123281259", + "meta_type": "dlccs:DataladDatasetSE", + "uuid": "0b76362c-aa27-11ee-be29-b3b123281259" + }, + { + "meta_id": "gitsha:558275f650574389dcbbf7cd8ab5046482473fc8", + "meta_type": "dlccs:DataladDatasetVersionSE", + "is_version_of": "datalad:0b76362c-aa27-11ee-be29-b3b123281259" + } + ], + "@type": "ContainerSE" +} diff --git a/src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.rdf b/src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.rdf new file mode 100644 index 0000000..bb91148 --- /dev/null +++ b/src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.rdf @@ -0,0 +1,19 @@ +@prefix DCAT: . +@prefix datalad: . +@prefix dlccs: . +@prefix dlco: . +@prefix gitsha: . +@prefix ns1: . +@prefix xsd: . + +gitsha:558275f650574389dcbbf7cd8ab5046482473fc8 DCAT:isVersionOf datalad:0b76362c-aa27-11ee-be29-b3b123281259 ; + dlco:meta_type "dlccs:DataladDatasetVersionSE"^^xsd:anyURI . 
+ +datalad:0b76362c-aa27-11ee-be29-b3b123281259 dlco:meta_type "dlccs:DataladDatasetSE"^^xsd:anyURI ; + dlco:uuid "0b76362c-aa27-11ee-be29-b3b123281259"^^ . + +[] a dlccs:Container ; + ns1:components datalad:0b76362c-aa27-11ee-be29-b3b123281259, + gitsha:558275f650574389dcbbf7cd8ab5046482473fc8 . + + diff --git a/src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.yaml b/src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.yaml new file mode 100644 index 0000000..c630840 --- /dev/null +++ b/src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.yaml @@ -0,0 +1,7 @@ +components: + - meta_id: datalad:0b76362c-aa27-11ee-be29-b3b123281259 + meta_type: dlccs:DataladDatasetSE + uuid: 0b76362c-aa27-11ee-be29-b3b123281259 + - meta_id: gitsha:558275f650574389dcbbf7cd8ab5046482473fc8 + meta_type: dlccs:DataladDatasetVersionSE + is_version_of: datalad:0b76362c-aa27-11ee-be29-b3b123281259 diff --git a/src/examples/datalad-dataset-components/DataladDatasetSE-minimal.json b/src/examples/datalad-dataset-components/DataladDatasetSE-minimal.json new file mode 100644 index 0000000..e5722ef --- /dev/null +++ b/src/examples/datalad-dataset-components/DataladDatasetSE-minimal.json @@ -0,0 +1,6 @@ +{ + "meta_id": "datalad:0b76362c-aa27-11ee-be29-b3b123281259", + "meta_type": "dlccs:DataladDatasetSE", + "uuid": "0b76362c-aa27-11ee-be29-b3b123281259", + "@type": "DataladDatasetSE" +} diff --git a/src/examples/datalad-dataset-components/DataladDatasetSE-minimal.rdf b/src/examples/datalad-dataset-components/DataladDatasetSE-minimal.rdf new file mode 100644 index 0000000..ddd723e --- /dev/null +++ b/src/examples/datalad-dataset-components/DataladDatasetSE-minimal.rdf @@ -0,0 +1,8 @@ +@prefix datalad: . +@prefix dlco: . +@prefix xsd: . + +datalad:0b76362c-aa27-11ee-be29-b3b123281259 dlco:meta_type "dlccs:DataladDatasetSE"^^xsd:anyURI ; + dlco:uuid "0b76362c-aa27-11ee-be29-b3b123281259"^^ . 
+ + diff --git a/src/examples/datalad-dataset-components/DataladDatasetSE-minimal.yaml b/src/examples/datalad-dataset-components/DataladDatasetSE-minimal.yaml new file mode 100644 index 0000000..dd5abea --- /dev/null +++ b/src/examples/datalad-dataset-components/DataladDatasetSE-minimal.yaml @@ -0,0 +1,3 @@ +meta_id: datalad:0b76362c-aa27-11ee-be29-b3b123281259 +meta_type: dlccs:DataladDatasetSE +uuid: 0b76362c-aa27-11ee-be29-b3b123281259 diff --git a/src/extra-docs/datalad-dataset-components-schema/about.md b/src/extra-docs/datalad-dataset-components-schema/about.md new file mode 100644 index 0000000..e69de29 diff --git a/src/extra-docs/index.md b/src/extra-docs/index.md index 1ccc06b..afee2c2 100644 --- a/src/extra-docs/index.md +++ b/src/extra-docs/index.md @@ -1,4 +1,5 @@ - [DataLad concepts ontology](ontology/) +- [DataLad dataset components schema](datalad-dataset-components-schema/) - [DataLad dataset version schema](datalad-dataset-version-schema/) - [Data access schema](data-access-schema/) - [Git provenance schema](git-provenance-schema/) diff --git a/src/linkml/ontology/datalad.yaml b/src/linkml/ontology/datalad.yaml index 44ce3db..c95e9c4 100644 --- a/src/linkml/ontology/datalad.yaml +++ b/src/linkml/ontology/datalad.yaml @@ -39,3 +39,5 @@ classes: distinguishing feature between a DataLad dataset and a plain Git or git-annex repository branch. required: true + close_mappings: + - dlco:Dataset diff --git a/src/linkml/schemas/datalad-dataset-components.yaml b/src/linkml/schemas/datalad-dataset-components.yaml new file mode 100644 index 0000000..5cecde6 --- /dev/null +++ b/src/linkml/schemas/datalad-dataset-components.yaml @@ -0,0 +1,98 @@ +id: https://concepts.datalad.org/schemas/datalad-dataset-components +name: datalad-dataset-components +title: Schema for any DataLad dataset component +description: | + This specification defines a schema for describing a DataLad dataset + via a flat list of linked metadata objects. 
Each metadata object + has a simple structure with minimal property nesting. + + Each object is of type `ComponentSE`, a class that has two required slots: + `meta_id` and `meta_type`. The former must be a unique object identifier, + and the latter must identify a subclass of `ComponentSE` that the object + represents. Both values must be given as CURIEs. + + Nesting of objects is avoided, unless a DataLad dataset contains insufficient + information to guarantee the availability of a unique identifier for a given + entity. + + Subclasses of `ComponentSE` represent all recognized dataset components. + These classes, within this schema, only define the nature of the employed + identifier, and the linkage to other dataset components. All other + properties are defined by a corresponding ontology class that is applied + as a "mixin". + + An additional `ContainerSE` class is provided. It has a single slot + `components`, and can be used to represent a list of components for formats + and use cases that require a root class/instance. + +prefixes: + annex: https://concepts.datalad.org/namespace/annex-uuid/ + datalad: https://concepts.datalad.org/namespace/dataset-uuid/ + DCAT: http://www.w3.org/ns/dcat# + dcterms: http://purl.org/dc/terms/ + dlco: https://concepts.datalad.org/ontology/ + dlccs: https://concepts.datalad.org/schemas/datalad-dataset-components/ + gitsha: https://concepts.datalad.org/namespace/gitsha/ + linkml: https://w3id.org/linkml/ + prov: http://www.w3.org/ns/prov# + spdx: http://spdx.org/rdf/terms# + +imports: + - ../ontology/meta_utils + - ../ontology/datalad + - ../ontology/git-annex + +classes: + ContainerSE: + class_uri: dlccs:Container + description: >- + A container for dataset component objects. + tree_root: true + attributes: + components: + description: >- + Component list. 
+ multivalued: true + inlined_as_list: true + range: ComponentSE + + ComponentSE: + class_uri: dlccs:Component + description: >- + Base class for any recognized dataset component type. This class + should never be used directly, only its subclasses. + slots: + - meta_id + - meta_type + slot_usage: + meta_type: + required: true + + DataladDatasetVersionSE: + class_uri: dlccs:DataladDatasetVersionSE + is_a: ComponentSE + description: >- + TODO + mixins: + - DataladDatasetVersion + slot_usage: + is_version_of: + inlined: false + range: DataladDatasetSE + meta_id: + equals_expression: "gitsha:{gitsha}" + + DataladDatasetSE: + class_uri: dlccs:DataladDatasetSE + is_a: ComponentSE + description: >- + Schema element for a `DataladDataset`. + see_also: + - dlco:DataladDataset + comments: + - The required identifier format is `datalad:<uuid>`. + mixins: + - DataladDataset + slot_usage: + meta_id: + equals_expression: "datalad:{uuid}" diff --git a/src/linkml/schemas/ontology.yaml b/src/linkml/schemas/ontology.yaml index e51baf9..e6e1893 100644 --- a/src/linkml/schemas/ontology.yaml +++ b/src/linkml/schemas/ontology.yaml @@ -1,4 +1,4 @@ -id: https://concepts.datalad.org/schemas/ontology +id: https://concepts.datalad.org/ontology name: datalad-concepts-ontology title: DataLad Concepts Ontology (DLCO) description: |- diff --git a/tests/datalad-dataset-components-schema/validation/Container-DataladDatasetSE-minimal.yaml b/tests/datalad-dataset-components-schema/validation/Container-DataladDatasetSE-minimal.yaml new file mode 100644 index 0000000..2d392b7 --- /dev/null +++ b/tests/datalad-dataset-components-schema/validation/Container-DataladDatasetSE-minimal.yaml @@ -0,0 +1,5 @@ +components: + - meta_id: datalad:0b76362c-aa27-11ee-be29-b3b123281259 + meta_type: dlccs:DataladDataset + #meta_type: DataladDatasetSE + uuid: 0b76362c-aa27-11ee-be29-b3b123281259 diff --git a/tests/datalad-dataset-components-schema/validation/ContainerSE.valid.cfg.yaml 
b/tests/datalad-dataset-components-schema/validation/ContainerSE.valid.cfg.yaml new file mode 100644 index 0000000..6c88ca7 --- /dev/null +++ b/tests/datalad-dataset-components-schema/validation/ContainerSE.valid.cfg.yaml @@ -0,0 +1,10 @@ +schema: src/linkml/schemas/datalad-dataset-components.yaml +target_class: ContainerSE +data_sources: + - src/examples/datalad-dataset-components/ContainerSE-DataladDataset-minimal.yaml + - src/examples/datalad-dataset-components/ContainerSE-DataladDatasetVersion-linkage.yaml +plugins: + JsonschemaValidationPlugin: + closed: true + include_range_class_descendants: false + RecommendedSlotsPlugin: