about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x examples/dump-dataset-metadata.scm 168
1 files changed, 93 insertions, 75 deletions
diff --git a/examples/dump-dataset-metadata.scm b/examples/dump-dataset-metadata.scm
index 66c0652..d1ec695 100755
--- a/examples/dump-dataset-metadata.scm
+++ b/examples/dump-dataset-metadata.scm
@@ -80,125 +80,139 @@
            (left-join GeneChip "USING (GeneChipId)"))
           "WHERE GN_AccesionId IS NOT NULL")
   (schema-triples
-   (gnt:dataset rdfs:range rdfs:Literal)
-   (gnt:datasetOfInvestigator rdfs:domain gn:dataset)
-   (gnt:datasetOfOrganization rdfs:domain gn:dataset)
-   (gnt:datasetOfInvestigator rdfs:range foaf:Person)
-   (gnt:datasetOfInbredSet rdfs:domain gn:dataset)
-   (gnt:datasetOfInbredSet rdfs:range gn:inbredSet)
-   (gnt:datasetOfSpecies rdfs:domain gn:dataset)
-   (gnt:datasetOfSpecies rdfs:range gn:inbredSet)
-   (gnt:datasetOfTissue rdfs:domain gn:dataset)
-   (gnt:datasetOfTissue rdfs:range gn:tissue)
-   (gnt:normalization rdfs:domain gn:dataset)
-   (gnt:normalization rdfs:range gn:avgMethod)
-   (gnt:datasetOfPlatform rdfs:domain gn:dataset)
-   (gnt:datasetOfPlatform rdfs:range gn:geneChip)
-   (gnt:accessionId rdfs:range rdfs:Literal)
-   (gnt:datasetStatusName rdfs:range rdfs:Literal)
-   (gnt:summary rdfs:range rdfs:Literal)
-   (gnt:aboutTissue rdfs:range rdfs:Literal)
-   (gnt:geoSeries rdfs:range rdfs:Literal)
-   (gnt:name rdfs:range rdfs:Literal)
-   (gnt:title rdfs:range rdfs:Literal)
-   (gnt:publicationTitle rdfs:range rdfs:Literal)
-   (gnt:specifics rdfs:range rdfs:Literal)
-   (gnt:datasetGroup rdfs:range rdfs:Literal)
-   (gnt:aboutCases rdfs:range rdfs:Literal)
-   (gnt:aboutPlatform rdfs:range rdfs:Literal)
-   (gnt:aboutDataProcessing rdfs:range rdfs:Literal)
-   (gnt:notes rdfs:range rdfs:Literal)
-   (gnt:experimentDesign rdfs:range rdfs:Literal)
-   (gnt:contributors rdfs:range rdfs:Literal)
-   (gnt:citation rdfs:range rdfs:Literal)
-   (gnt:acknowledgment rdfs:range rdfs:Literal))
+   (gnc:dataset rdf:type gdmt:Dataset)
+   (gnc:genotype rdfs:subClassOf gnc:dataset)
+   (gnc:phenotype rdfs:subClassOf gnc:dataset)
+   (gnt:belongsToInbredSet rdfs:domain gnc:dataset)
+   (gnt:belongsToInbredSet a owl:ObjectProperty)
+   (gnt:belongsToInbredSet skos:definition "The InbredSet this resource belongs to")
+   (gnt:hasTissue rdfs:domain gnc:dataset)
+   (gnt:hasTissue a owl:ObjectProperty)
+   (gnt:hasTissue skos:definition "Tissues this resource has")
+   (gnt:hasTissueInfo rdfs:domain gnc:dataset)
+   (gnt:hasTissueInfo a owl:ObjectProperty)
+   (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource")
+   (gnt:usedNormalization rdfs:domain gnc:dataset)
+   (gnt:usedNormalization a owl:ObjectProperty)
+   (gnt:usedNormalization skos:definition "Normalization techniques this resource has")
+   (gnt:usedPlatform rdfs:domain gnc:dataset)
+   (gnt:usedPlatform a owl:ObjectProperty)
+   (gnt:usedPlatform skos:definition "The Platform this resource uses")
+   (gnt:hasGeoSeriesId rdfs:domain gnc:dataset)
+   (gnt:hasGeoSeriesId a owl:ObjectProperty)
+   (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database")
+   (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset)
+   (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design")
+   (gnt:hasExperimentDesignInfo a owl:ObjectProperty)
+   (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed")
+   (gnt:hasNotes rdfs:domain gnc:dataset)
+   (gnt:hasNotes a owl:ObjectProperty)
+   (gnt:hasNotes rdfs:label "Notes")
+   (gnt:hasNotes skos:definition "Extra Notes about this dataset")
+   (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset)
+   (gnt:hasDataProcessingInfo rdfs:label "About Data Processing")
+   (gnt:hasDataProcessingInfo a owl:ObjectProperty)
+   (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed")
+   (gnt:hasPlatformInfo rdfs:domain gnc:dataset)
+   (gnt:hasPlatformInfo a owl:ObjectProperty)
+   (gnt:hasPlatformInfo rdfs:label "About Platform")
+   (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset")
+   (gnt:hasCaseInfo rdfs:domain gnc:dataset)
+   (gnt:hasCaseInfo rdfs:label "About Case")
+   (gnt:hasCaseInfo a owl:ObjectProperty)
+   (gnt:hasCaseInfo skos:definition "Information about the cases used in this dataset")
+   (gnt:hasAcknowledgement rdfs:domain gnc:dataset)
+   (gnt:hasAcknowledgement rdfs:label "Acknowledgement")
+   (gnt:hasAcknowledgement a owl:ObjectProperty)
+   (gnt:hasAcknowledgement skos:definition "People to acknowledge"))
   (triples (string->identifier
             "" (regexp-substitute/global #f "[^A-Za-z0-9:]"
-                                        (field InfoFiles InfoPageName)
-                                        'pre "_" 'post)
+                                         (field InfoFiles InfoPageName)
+                                         'pre "_" 'post)
             #:separator ""
             #:proc string-capitalize-first)
     (set rdf:type (string->symbol
-                   (field ("IF(GenoFreeze.Id IS NOT NULL, 'gn:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gn:phenotypeDataset', 'gn:dataset'))"
+                   (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotype', 'gnc:dataset'))"
                            rdfType))))
-    (set gnt:name (regexp-substitute/global
-                       #f "^[Nn]one$"
-                       (field InfoFiles InfoPageName)
-                       ""))
-    (set gnt:fullName
+    (set rdfs:label (regexp-substitute/global
+                     #f "^[Nn]one$"
+                     (field InfoFiles InfoPageName)
+                     ""))
+    (set gdmt:hasTitleInfo
          (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
                  DatasetFullName)))
+    (set gdmt:hasTitleInfo (field Datasets DatasetName DatasetGroup))
+    (set gdmt:hasTitleInfo
+         (regexp-substitute/global
+          #f "^[Nn]one$"
+          (field InfoFiles InfoFileTitle)
+          ""))
+    ;; This is the published title
+    (set dct:title
+         (regexp-substitute/global
+          #f "^[Nn]one$"
+          (field Datasets PublicationTitle)
+          ""))
     (set dct:created
          (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
                  createTimeGenoFreeze)))
-    (set gnt:datasetOfInvestigator
+    (set gdmt:hasCreatorInfo
          (investigator-attributes->id (field Investigators FirstName)
                                       (field Investigators LastName)
                                       (field Investigators Email)))
-    (set gnt:datasetOfOrganization
+    (set gdmt:hasCreatorAffiliation
          (field ("CAST(CONVERT(BINARY CONVERT(Organizations.OrganizationName USING latin1) USING utf8) AS VARCHAR(1500))" Organizations)))
-    (set gnt:accessionId (format #f "GN~a" (field InfoFiles GN_AccesionId)))
-    (set gnt:datasetStatusName (string-downcase
-                                    (field DatasetStatus DatasetStatusName)))
-    (set gnt:datasetOfInbredSet
+    (set gdmt:hasDatasetIdentifierSubType (format #f "GN~a" (field InfoFiles GN_AccesionId)))
+    (set gdmt:hasRightsInfo (string-downcase
+                             (field DatasetStatus DatasetStatusName)))
+    (set gnt:belongsToInbredSet
          (string->identifier "inbredSet" (field InbredSet Name InbredSetName)))
-    (set gnt:datasetOfTissue (string->identifier "tissue"
-                                                     (field Tissue Short_Name)))
-    (set gnt:normalization
+    (set gnt:hasTissue (string->identifier "tissue"
+                                           (field Tissue Short_Name)))
+    (set gnt:usedNormalization
          (string->identifier "avgmethod"
                              ;; If AvgMethodName is NULL, assume N/A.
                              (if (string-blank? (field AvgMethod Name AvgMethodName))
                                  "N/A" (field AvgMethod Name AvgMethodName))))
-    (set gnt:datasetOfPlatform
+    (set gnt:usedPlatform
          (string->identifier "platform"
                              (field GeneChip Name GeneChip)))
-    (set gnt:summary
+    (set gdmt:isDescribedBy
          (sanitize-rdf-string (field Datasets Summary)))
-    (set gnt:aboutTissue
-         (sanitize-rdf-string (field Datasets AboutTissue)))
-    (set gnt:geoSeries
+    (set gnt:hasGeoSeriesId
          (let ((s
                 (string-match "GSE[0-9]*"
                               (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
            (if s (ontology
                   'geoSeries: (match:substring s))
                "")))
-    (set gnt:title
-         (regexp-substitute/global
-          #f "^[Nn]one$"
-          (field InfoFiles InfoFileTitle)
-          ""))
-    (set gnt:publicationTitle
-         (regexp-substitute/global
-          #f "^[Nn]one$"
-          (field Datasets PublicationTitle)
-          ""))
-    (set gnt:specifics (sanitize-rdf-string (field InfoFiles Specifics)))
-    (set gnt:datasetGroup (field Datasets DatasetName DatasetGroup))
-    (set gnt:aboutCases
+    (set gnt:hasTissueInfo
+         (sanitize-rdf-string (field Datasets AboutTissue)))
+    (set gnt:hasContentInfo (sanitize-rdf-string (field InfoFiles Specifics)))
+    (set gnt:hasCaseInfo
          (sanitize-rdf-string
           (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutCases USING latin1) USING utf8) AS VARCHAR(10000))" AboutCases))))
-    (set gnt:aboutPlatform
+    (set gnt:hasPlatformInfo
          (sanitize-rdf-string
           (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutPlatform USING latin1) USING utf8) AS VARCHAR(1500))"
                   AboutPlatform))))
-    (set gnt:aboutDataProcessing
+    (set gnt:hasDataProcessingInfo
          (sanitize-rdf-string
           (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutDataProcessing USING latin1) USING utf8) AS VARCHAR(1500))"
                   AboutDataProcessing))))
-    (set gnt:notes
+    (set gnt:hasNotes
          (sanitize-rdf-string
           (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Notes USING latin1) USING utf8) AS VARCHAR(1500))"
                   GNNotes))))
-    (set gnt:experimentDesign
+    (set gnt:hasExperimentDesignInfo
          (sanitize-rdf-string
           (field ("CAST(CONVERT(BINARY CONVERT(Datasets.ExperimentDesign USING latin1) USING utf8) AS VARCHAR(1500))"
                   ExperimentDesign))))
-    (set gnt:contributors
+    (set gdmt:hasContributorInfo
          (sanitize-rdf-string
           (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Contributors USING latin1) USING utf8) AS VARCHAR(1500))"
                   Contributors))))
-    (set gnt:citation
+    (set gdmt:IsCitedBy
          (sanitize-rdf-string
           (regexp-substitute/global
            #f "^[Nn]one$"
@@ -206,7 +220,7 @@
             ("CAST(CONVERT(BINARY CONVERT(Datasets.Citation USING latin1) USING utf8) AS VARCHAR(1500))"
              Citation))
            "")))
-    (set gnt:dataSourceAcknowledgment
+    (set gnt:hasAcknowledgement
          (sanitize-rdf-string
           (string-trim-both
            (regexp-substitute/global
@@ -214,7 +228,7 @@
             (field ("CAST(CONVERT(BINARY CONVERT(InfoFiles.Data_Source_Acknowledge USING latin1) USING utf8) AS VARCHAR(1500))"
                     Data_Source_Acknowledge))
             ""))))
-    (set gnt:acknowledgment (sanitize-rdf-string
+    (set gnt:hasAcknowledgement (sanitize-rdf-string
                                  (field Datasets Acknowledgment)))))
 
 
@@ -227,10 +241,14 @@
  (prefixes
   '(("v:" "<http://www.w3.org/2006/vcard/ns#>")
     ("foaf:" "<http://xmlns.com/foaf/0.1/>")
+    ("gdmt:" "<http://vocab.fairdatacollective.org/gdmt/>")
+    ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
     ("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>")
     ("gnt:" "<http://genenetwork.org/term/>")
     ("gn:" "<http://genenetwork.org/id/>")
+    ("gnc:" "<http://genenetwork.org/category/>")
     ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+    ("owl:" "<http://www.w3.org/2002/07/owl#>")
     ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
     ("taxon:" "<http://purl.uniprot.org/taxonomy/>")
     ("dct:" "<http://purl.org/dc/terms/>")))