From 2fada0707238f81c9f5a8c36437c8ef6c608ed4d Mon Sep 17 00:00:00 2001 From: Munyoki Kilyungi Date: Wed, 2 Aug 2023 20:25:54 +0300 Subject: Re-model how datasets are fetched Signed-off-by: Munyoki Kilyungi --- examples/dump-dataset-metadata.scm | 168 ++++++++++++++++++++----------------- 1 file changed, 93 insertions(+), 75 deletions(-) (limited to 'examples') diff --git a/examples/dump-dataset-metadata.scm b/examples/dump-dataset-metadata.scm index 66c0652..d1ec695 100755 --- a/examples/dump-dataset-metadata.scm +++ b/examples/dump-dataset-metadata.scm @@ -80,125 +80,139 @@ (left-join GeneChip "USING (GeneChipId)")) "WHERE GN_AccesionId IS NOT NULL") (schema-triples - (gnt:dataset rdfs:range rdfs:Literal) - (gnt:datasetOfInvestigator rdfs:domain gn:dataset) - (gnt:datasetOfOrganization rdfs:domain gn:dataset) - (gnt:datasetOfInvestigator rdfs:range foaf:Person) - (gnt:datasetOfInbredSet rdfs:domain gn:dataset) - (gnt:datasetOfInbredSet rdfs:range gn:inbredSet) - (gnt:datasetOfSpecies rdfs:domain gn:dataset) - (gnt:datasetOfSpecies rdfs:range gn:inbredSet) - (gnt:datasetOfTissue rdfs:domain gn:dataset) - (gnt:datasetOfTissue rdfs:range gn:tissue) - (gnt:normalization rdfs:domain gn:dataset) - (gnt:normalization rdfs:range gn:avgMethod) - (gnt:datasetOfPlatform rdfs:domain gn:dataset) - (gnt:datasetOfPlatform rdfs:range gn:geneChip) - (gnt:accessionId rdfs:range rdfs:Literal) - (gnt:datasetStatusName rdfs:range rdfs:Literal) - (gnt:summary rdfs:range rdfs:Literal) - (gnt:aboutTissue rdfs:range rdfs:Literal) - (gnt:geoSeries rdfs:range rdfs:Literal) - (gnt:name rdfs:range rdfs:Literal) - (gnt:title rdfs:range rdfs:Literal) - (gnt:publicationTitle rdfs:range rdfs:Literal) - (gnt:specifics rdfs:range rdfs:Literal) - (gnt:datasetGroup rdfs:range rdfs:Literal) - (gnt:aboutCases rdfs:range rdfs:Literal) - (gnt:aboutPlatform rdfs:range rdfs:Literal) - (gnt:aboutDataProcessing rdfs:range rdfs:Literal) - (gnt:notes rdfs:range rdfs:Literal) - (gnt:experimentDesign rdfs:range 
rdfs:Literal) - (gnt:contributors rdfs:range rdfs:Literal) - (gnt:citation rdfs:range rdfs:Literal) - (gnt:acknowledgment rdfs:range rdfs:Literal)) + (gnc:dataset rdf:type gdmt:Dataset) + (gnc:genotype rdfs:subClassOf gnc:dataset) + (gnc:phenotype rdfs:subClassOf gnc:dataset) + (gnt:belongsToInbredSet rdfs:domain gnc:dataset) + (gnt:belongsToInbredSet a owl:ObjectProperty) + (gnt:belongsToInbredSet skos:definition "The InbredSet this resource belongs to") + (gnt:hasTissue rdfs:domain gnc:dataset) + (gnt:hasTissue a owl:ObjectProperty) + (gnt:hasTissue skos:definition "Tissues this resource has") + (gnt:hasTissueInfo rdfs:domain gnc:dataset) + (gnt:hasTissueInfo a owl:ObjectProperty) + (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource") + (gnt:usedNormalization rdfs:domain gnc:dataset) + (gnt:usedNormalization a owl:ObjectProperty) + (gnt:usedNormalization skos:definition "Normalization techniques this resource has") + (gnt:usedPlatform rdfs:domain gnc:dataset) + (gnt:usedPlatform a owl:ObjectProperty) + (gnt:usedPlatform skos:definition "The Platform this resource uses") + (gnt:hasGeoSeriesId rdfs:domain gnc:dataset) + (gnt:hasGeoSeriesId a owl:ObjectProperty) + (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database") + (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset) + (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design") + (gnt:hasExperimentDesignInfo a owl:ObjectProperty) + (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed") + (gnt:hasNotes rdfs:domain gnc:dataset) + (gnt:hasNotes a owl:ObjectProperty) + (gnt:hasNotes rdfs:label "Notes") + (gnt:hasNotes skos:definition "Extra Notes about this dataset") + (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset) + (gnt:hasDataProcessingInfo rdfs:label "About Data Processing") + (gnt:hasDataProcessingInfo a owl:ObjectProperty) + (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed") + 
(gnt:hasPlatformInfo rdfs:domain gnc:dataset) + (gnt:hasPlatformInfo a owl:ObjectProperty) + (gnt:hasPlatformInfo rdfs:label "About Platform") + (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset") + (gnt:hasCaseInfo rdfs:domain gnc:dataset) + (gnt:hasCaseInfo rdfs:label "About Case") + (gnt:hasCaseInfo a owl:ObjectProperty) + (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform") + (gnt:hasAcknowledgement rdfs:domain gnc:dataset) + (gnt:hasAcknowledgement rdfs:label "Acknowledgement") + (gnt:hasAcknowledgement a owl:ObjectProperty) + (gnt:hasAcknowledgement skos:definition "People to acknowledge")) (triples (string->identifier "" (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field InfoFiles InfoPageName) - 'pre "_" 'post) + (field InfoFiles InfoPageName) + 'pre "_" 'post) #:separator "" #:proc string-capitalize-first) (set rdf:type (string->symbol - (field ("IF(GenoFreeze.Id IS NOT NULL, 'gn:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gn:phenotypeDataset', 'gn:dataset'))" + (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotype', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotype', 'gnc:dataset'))" rdfType)))) - (set gnt:name (regexp-substitute/global - #f "^[Nn]one$" - (field InfoFiles InfoPageName) - "")) - (set gnt:fullName + (set rdfs:label (regexp-substitute/global + #f "^[Nn]one$" + (field InfoFiles InfoPageName) + "")) + (set gdmt:hasTitleInfo (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))" DatasetFullName))) + (set gdmt:hasTitleInfo (field Datasets DatasetName DatasetGroup)) + (set gdmt:hasTitleInfo + (regexp-substitute/global + #f "^[Nn]one$" + (field InfoFiles InfoFileTitle) + "")) + ;; This is the published title + (set dct:title + (regexp-substitute/global + #f "^[Nn]one$" + (field Datasets PublicationTitle) + "")) (set dct:created (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))" 
createTimeGenoFreeze))) - (set gnt:datasetOfInvestigator + (set gdmt:hasCreatorInfo (investigator-attributes->id (field Investigators FirstName) (field Investigators LastName) (field Investigators Email))) - (set gnt:datasetOfOrganization + (set gdmt:hasCreatorAffiliation (field ("CAST(CONVERT(BINARY CONVERT(Organizations.OrganizationName USING latin1) USING utf8) AS VARCHAR(1500))" Organizations))) - (set gnt:accessionId (format #f "GN~a" (field InfoFiles GN_AccesionId))) - (set gnt:datasetStatusName (string-downcase - (field DatasetStatus DatasetStatusName))) - (set gnt:datasetOfInbredSet + (set gdmt:hasDatasetIdentifierSubType (format #f "GN~a" (field InfoFiles GN_AccesionId))) + (set gdmt:hasRightsInfo (string-downcase + (field DatasetStatus DatasetStatusName))) + (set gnt:belongsToInbredSet (string->identifier "inbredSet" (field InbredSet Name InbredSetName))) - (set gnt:datasetOfTissue (string->identifier "tissue" - (field Tissue Short_Name))) - (set gnt:normalization + (set gnt:hasTissue (string->identifier "tissue" + (field Tissue Short_Name))) + (set gnt:usedNormalization (string->identifier "avgmethod" ;; If AvgMethodName is NULL, assume N/A. (if (string-blank? 
(field AvgMethod Name AvgMethodName)) "N/A" (field AvgMethod Name AvgMethodName)))) - (set gnt:datasetOfPlatform + (set gnt:usedPlatform (string->identifier "platform" (field GeneChip Name GeneChip))) - (set gnt:summary + (set gdmt:isDescribedBy (sanitize-rdf-string (field Datasets Summary))) - (set gnt:aboutTissue - (sanitize-rdf-string (field Datasets AboutTissue))) - (set gnt:geoSeries + (set gnt:hasGeoSeriesId (let ((s (string-match "GSE[0-9]*" (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries))))) (if s (ontology 'geoSeries: (match:substring s)) ""))) - (set gnt:title - (regexp-substitute/global - #f "^[Nn]one$" - (field InfoFiles InfoFileTitle) - "")) - (set gnt:publicationTitle - (regexp-substitute/global - #f "^[Nn]one$" - (field Datasets PublicationTitle) - "")) - (set gnt:specifics (sanitize-rdf-string (field InfoFiles Specifics))) - (set gnt:datasetGroup (field Datasets DatasetName DatasetGroup)) - (set gnt:aboutCases + (set gnt:hasTissueInfo + (sanitize-rdf-string (field Datasets AboutTissue))) + (set gnt:hasContentInfo (sanitize-rdf-string (field InfoFiles Specifics))) + (set gnt:hasCaseInfo (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutCases USING latin1) USING utf8) AS VARCHAR(10000))" AboutCases)))) - (set gnt:aboutPlatform + (set gnt:hasPlatformInfo (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutPlatform USING latin1) USING utf8) AS VARCHAR(1500))" AboutPlatform)))) - (set gnt:aboutDataProcessing + (set gnt:hasDataProcessingInfo (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.AboutDataProcessing USING latin1) USING utf8) AS VARCHAR(1500))" AboutDataProcessing)))) - (set gnt:notes + (set gnt:hasNotes (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Notes USING latin1) USING utf8) AS VARCHAR(1500))" GNNotes)))) - (set gnt:experimentDesign + (set gnt:hasExperimentDesignInfo (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.ExperimentDesign USING 
latin1) USING utf8) AS VARCHAR(1500))" ExperimentDesign)))) - (set gnt:contributors + (set gdmt:hasContributorInfo (sanitize-rdf-string (field ("CAST(CONVERT(BINARY CONVERT(Datasets.Contributors USING latin1) USING utf8) AS VARCHAR(1500))" Contributors)))) - (set gnt:citation + (set gdmt:IsCitedBy (sanitize-rdf-string (regexp-substitute/global #f "^[Nn]one$" @@ -206,7 +220,7 @@ ("CAST(CONVERT(BINARY CONVERT(Datasets.Citation USING latin1) USING utf8) AS VARCHAR(1500))" Citation)) ""))) - (set gnt:dataSourceAcknowledgment + (set gnt:hasAcknowledgement (sanitize-rdf-string (string-trim-both (regexp-substitute/global @@ -214,7 +228,7 @@ (field ("CAST(CONVERT(BINARY CONVERT(InfoFiles.Data_Source_Acknowledge USING latin1) USING utf8) AS VARCHAR(1500))" Data_Source_Acknowledge)) "")))) - (set gnt:acknowledgment (sanitize-rdf-string + (set gnt:hasAcknowledgement (sanitize-rdf-string (field Datasets Acknowledgment))))) @@ -227,10 +241,14 @@ (prefixes '(("v:" "") ("foaf:" "") + ("gdmt:" "") + ("skos:" "") ("geoSeries:" "") ("gnt:" "") ("gn:" "") + ("gnc:" "") ("rdf:" "") + ("owl:" "") ("rdfs:" "") ("taxon:" "") ("dct:" ""))) -- cgit v1.2.3