about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Munyoki Kilyungi, 2023-08-02 20:25:54 +0300
committer: Munyoki Kilyungi, 2023-08-02 20:41:59 +0300
commit: 2fada0707238f81c9f5a8c36437c8ef6c608ed4d (patch)
tree: 17f9279850428ded4b3ef89cad04601d829ddfdc
parent: 4ee13bab45aa1d1c878cb8afd1a7f72348cb17ed (diff)
download: gn-transform-databases-2fada0707238f81c9f5a8c36437c8ef6c608ed4d.tar.gz
Re-model how datasets are fetched
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x examples/dump-dataset-metadata.scm 168
1 files changed, 93 insertions, 75 deletions
diff --git a/examples/dump-dataset-metadata.scm b/examples/dump-dataset-metadata.scm
index 66c0652..d1ec695 100755
--- a/examples/dump-dataset-metadata.scm
+++ b/examples/dump-dataset-metadata.scm
@@ -80,125 +80,139 @@
(left-join GeneChip "USING (GeneChipId)"))
"WHERE GN_AccesionId IS NOT NULL")
(schema-triples
- (gnt:dataset rdfs:range rdfs:Literal)
- (gnt:datasetOfInvestigator rdfs:domain gn:dataset)
- (gnt:datasetOfOrganization rdfs:domain gn:dataset)
- (gnt:datasetOfInvestigator rdfs:range foaf:Person)
- (gnt:datasetOfInbredSet rdfs:domain gn:dataset)
- (gnt:datasetOfInbredSet rdfs:range gn:inbredSet)
- (gnt:datasetOfSpecies rdfs:domain gn:dataset)
- (gnt:datasetOfSpecies rdfs:range gn:inbredSet)
- (gnt:datasetOfTissue rdfs:domain gn:dataset)
- (gnt:datasetOfTissue rdfs:range gn:tissue)
- (gnt:normalization rdfs:domain gn:dataset)
- (gnt:normalization rdfs:range gn:avgMethod)
- (gnt:datasetOfPlatform rdfs:domain gn:dataset)
- (gnt:datasetOfPlatform rdfs:range gn:geneChip)
- (gnt:accessionId rdfs:range rdfs:Literal)
- (gnt:datasetStatusName rdfs:range rdfs:Literal)
- (gnt:summary rdfs:range rdfs:Literal)
- (gnt:aboutTissue rdfs:range rdfs:Literal)
- (gnt:geoSeries rdfs:range rdfs:Literal)
- (gnt:name rdfs:range rdfs:Literal)
- (gnt:title rdfs:range rdfs:Literal)
- (gnt:publicationTitle rdfs:range rdfs:Literal)
- (gnt:specifics rdfs:range rdfs:Literal)
- (gnt:datasetGroup rdfs:range rdfs:Literal)
- (gnt:aboutCases rdfs:range rdfs:Literal)
- (gnt:aboutPlatform rdfs:range rdfs:Literal)
- (gnt:aboutDataProcessing rdfs:range rdfs:Literal)
- (gnt:notes rdfs:range rdfs:Literal)
- (gnt:experimentDesign rdfs:range rdfs:Literal)
- (gnt:contributors rdfs:range rdfs:Literal)
- (gnt:citation rdfs:range rdfs:Literal)
- (gnt:acknowledgment rdfs:range rdfs:Literal))
+ (gnc:dataset rdf:type gdmt:Dataset)
+ (gnc:genotype rdfs:subClassOf gnc:dataset)
+ (gnc:phenotype rdfs:subClassOf gnc:dataset)
+ (gnt:belongsToInbredSet rdfs:domain gnc:dataset)
+ (gnt:belongsToInbredSet a owl:ObjectProperty)
+ (gnt:belongsToInbredSet skos:definition "The InbredSet this resource belongs to")
+ (gnt:hasTissue rdfs:domain gnc:dataset)
+ (gnt:hasTissue a owl:ObjectProperty)
+ (gnt:hasTissue skos:definition "Tissues this resource has")
+ (gnt:hasTissueInfo rdfs:domain gnc:dataset)
+ (gnt:hasTissueInfo a owl:ObjectProperty)
+ (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource")
+ (gnt:usedNormalization rdfs:domain gnc:dataset)
+ (gnt:usedNormalization a owl:ObjectProperty)
+ (gnt:usedNormalization skos:definition "Normalization techniques this resource has")
+ (gnt:usedPlatform rdfs:domain gnc:dataset)
+ (gnt:usedPlatform a owl:ObjectProperty)
+ (gnt:usedPlatform skos:definition "The Platform this resource uses")
+ (gnt:hasGeoSeriesId rdfs:domain gnc:dataset)
+ (gnt:hasGeoSeriesId a owl:ObjectProperty)
+ (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database")
+ (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset)
+ (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design")
+ (gnt:hasExperimentDesignInfo a owl:ObjectProperty)
+ (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed")
+ (gnt:hasNotes rdfs:domain gnc:dataset)
+ (gnt:hasNotes a owl:ObjectProperty)
+ (gnt:hasNotes rdfs:label "Notes")
+ (gnt:hasNotes skos:definition "Extra Notes about this dataset")
+ (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset)
+ (gnt:hasDataProcessingInfo rdfs:label "About Data Processing")
+ (gnt:hasDataProcessingInfo a owl:ObjectProperty)
+ (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed")
+ (gnt:hasPlatformInfo rdfs:domain gnc:dataset)
+ (gnt:hasPlatformInfo a owl:ObjectProperty)
+ (gnt:hasPlatformInfo rdfs:label "About Platform")
+ (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset")
+ (gnt:hasCaseInfo rdfs:domain gnc:dataset)
+ (gnt:hasCaseInfo rdfs:label "About Case")
+ (gnt:hasCaseInfo a owl:ObjectProperty)
+ (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform")
+ (gnt:hasAcknowledgement rdfs:domain gnc:dataset)
+ (gnt:hasAcknowledgement rdfs:label "Acknowledgement")
+ (gnt:hasAcknowledgement a owl:ObjectProperty)
+ (gnt:hasAcknowledgement skos:definition "People to acknowledge"))
(triples (string->identifier
"" (regexp-substitute/global #f "[^A-Za-z0-9:]"
- (field InfoFiles InfoPageName)
- 'pre "_" 'post)
+ (field InfoFiles InfoPageName)
+ 'pre "_" 'post)
#:separator ""
#:proc string-capitalize-first)
(set rdf:type (string->symbol
- (field ("IF(GenoFreeze.Id IS NOT NULL, 'gn:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gn:phenotypeDataset', 'gn:dataset'))"
+ (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotype', 'gnc:dataset'))"
rdfType))))
- (set gnt:name (regexp-substitute/global
- #f "^[Nn]one$"
- (field InfoFiles InfoPageName)
- ""))
- (set gnt:fullName
+ (set rdfs:label (regexp-substitute/global
+ #f "^[Nn]one$"
+ (field InfoFiles InfoPageName)
+ ""))
+ (set gdmt:hasTitleInfo
(field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
DatasetFullName)))
+ (set gdmt:hasTitleInfo (field Datasets DatasetName DatasetGroup))
+ (set gdmt:hasTitleInfo
+ (regexp-substitute/global
+ #f "^[Nn]one$"
+ (field InfoFiles InfoFileTitle)
+ ""))
+ ;; This is the published title
+ (set dct:title
+ (regexp-substitute/global
+ #f "^[Nn]one$"
+ (field Datasets PublicationTitle)
+ ""))
(set dct:created
(field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
createTimeGenoFreeze)))
- (set gnt:datasetOfInvestigator
+ (set gdmt:hasCreatorInfo
(investigator-attributes->id (field Investigators FirstName)
(field Investigators LastName)
(field Investigators Email)))
- (set gnt:datasetOfOrganization
+ (set gdmt:hasCreatorAffiliation
(field ("CAST(CONVERT(BINARY CONVERT(Organizations.OrganizationName USING latin1) USING utf8) AS VARCHAR(1500))" Organizations)))
- (set gnt:accessionId (format #f "GN~a" (field InfoFiles GN_AccesionId)))
- (set gnt:datasetStatusName (string-downcase
- (field DatasetStatus DatasetStatusName)))
- (set gnt:datasetOfInbredSet
+ (set gdmt:hasDatasetIdentifierSubType (format #f "GN~a" (field InfoFiles GN_AccesionId)))
+ (set gdmt:hasRightsInfo (string-downcase
+ (field DatasetStatus DatasetStatusName)))
+ (set gnt:belongsToInbredSet
(string->identifier "inbredSet" (field InbredSet Name InbredSetName)))
- (set gnt:datasetOfTissue (string->identifier "tissue"
- (field Tissue Short_Name)))
- (set gnt:normalization
+ (set gnt:hasTissue (string->identifier "tissue"
+ (field Tissue Short_Name)))
+ (set gnt:usedNormalization
(string->identifier "avgmethod"
;; If AvgMethodName is NULL, assume N/A.
(if (string-blank? (field AvgMethod Name AvgMethodName))
"N/A" (field AvgMethod Name AvgMethodName))))
- (set gnt:datasetOfPlatform
+ (set gnt:usedPlatform
(string->identifier "platform"
(field GeneChip Name GeneChip)))
- (set gnt:summary
+ (set gdmt:isDescribedBy
(sanitize-rdf-string (field Datasets Summary)))
- (set gnt:aboutTissue
- (sanitize-rdf-string (field Datasets AboutTissue)))
- (set gnt:geoSeries
+ (set gnt:hasGeoSeriesId
(let ((s
(string-match "GSE[0-9]*"
(field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
(if s (ontology
'geoSeries: (match:substring s))
"")))
- (set gnt:title
- (regexp-substitute/global
- #f "^[Nn]one$"
- (field InfoFiles InfoFileTitle)
- ""))
- (set gnt:publicationTitle
- (regexp-substitute/global
- #f "^[Nn]one$"
- (field Datasets PublicationTitle)
- ""))
- (set gnt:specifics (sanitize-rdf-string (field InfoFiles Specifics)))
- (set gnt:datasetGroup (field Datasets DatasetName DatasetGroup))
- (set gnt:aboutCases
+ (set gnt:hasTissueInfo
+ (sanitize-rdf-string (field Datasets AboutTissue)))
+ (set gnt:hasContentInfo (sanitize-rdf-string (field InfoFiles Specifics)))
+ (set gnt:hasCaseInfo
(sanitize-rdf-string
(field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutCases USING latin1) USING utf8) AS VARCHAR(10000))" AboutCases))))
- (set gnt:aboutPlatform
+ (set gnt:hasPlatformInfo
(sanitize-rdf-string
(field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutPlatform USING latin1) USING utf8) AS VARCHAR(1500))"
AboutPlatform))))
- (set gnt:aboutDataProcessing
+ (set gnt:hasDataProcessingInfo
(sanitize-rdf-string
(field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutDataProcessing USING latin1) USING utf8) AS VARCHAR(1500))"
AboutDataProcessing))))
- (set gnt:notes
+ (set gnt:hasNotes
(sanitize-rdf-string
(field ("CAST(CONVERT(BINARY CONVERT(Datasets.Notes USING latin1) USING utf8) AS VARCHAR(1500))"
GNNotes))))
- (set gnt:experimentDesign
+ (set gnt:hasExperimentDesignInfo
(sanitize-rdf-string
(field ("CAST(CONVERT(BINARY CONVERT(Datasets.ExperimentDesign USING latin1) USING utf8) AS VARCHAR(1500))"
ExperimentDesign))))
- (set gnt:contributors
+ (set gdmt:hasContributorInfo
(sanitize-rdf-string
(field ("CAST(CONVERT(BINARY CONVERT(Datasets.Contributors USING latin1) USING utf8) AS VARCHAR(1500))"
Contributors))))
- (set gnt:citation
+ (set gdmt:IsCitedBy
(sanitize-rdf-string
(regexp-substitute/global
#f "^[Nn]one$"
@@ -206,7 +220,7 @@
("CAST(CONVERT(BINARY CONVERT(Datasets.Citation USING latin1) USING utf8) AS VARCHAR(1500))"
Citation))
"")))
- (set gnt:dataSourceAcknowledgment
+ (set gnt:hasAcknowledgement
(sanitize-rdf-string
(string-trim-both
(regexp-substitute/global
@@ -214,7 +228,7 @@
(field ("CAST(CONVERT(BINARY CONVERT(InfoFiles.Data_Source_Acknowledge USING latin1) USING utf8) AS VARCHAR(1500))"
Data_Source_Acknowledge))
""))))
- (set gnt:acknowledgment (sanitize-rdf-string
+ (set gnt:hasAcknowledgement (sanitize-rdf-string
(field Datasets Acknowledgment)))))
@@ -227,10 +241,14 @@
(prefixes
'(("v:" "<http://www.w3.org/2006/vcard/ns#>")
("foaf:" "<http://xmlns.com/foaf/0.1/>")
+ ("gdmt:" "<http://vocab.fairdatacollective.org/gdmt/>")
+ ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>")
("gnt:" "<http://genenetwork.org/term/>")
("gn:" "<http://genenetwork.org/id/>")
+ ("gnc:" "<http://genenetwork.org/category/>")
("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+ ("owl:" "<http://www.w3.org/2002/07/owl#>")
("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
("taxon:" "<http://purl.uniprot.org/taxonomy/>")
("dct:" "<http://purl.org/dc/terms/>")))