From 51b3c0548c98e0bc05e11a89cbf6b75d31b9f8d5 Mon Sep 17 00:00:00 2001
From: Munyoki Kilyungi
Date: Mon, 21 Aug 2023 14:54:21 +0300
Subject: Remove "dump-" prefix

Signed-off-by: Munyoki Kilyungi
---
 examples/dataset-metadata.scm | 408 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 408 insertions(+)
 create mode 100755 examples/dataset-metadata.scm

(limited to 'examples/dataset-metadata.scm')

diff --git a/examples/dataset-metadata.scm b/examples/dataset-metadata.scm
new file mode 100755
index 0000000..5680a2b
--- /dev/null
+++ b/examples/dataset-metadata.scm
@@ -0,0 +1,408 @@
+#! /usr/bin/env guile
+!#
+
+;;; Transform dataset metadata (InfoFiles, PublishFreeze, GenoFreeze,
+;;; ProbeSetFreeze, Investigators and GeneChip tables) into RDF.
+;;;
+;;; Usage: dataset-metadata.scm CONNECTION-SETTINGS-FILE
+
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+
+
+;; Connection settings, read as a single datum from the file named by
+;; the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+
+
+;; One email ID in the Investigators table has spaces in it. This
+;; function fixes that.
+(define (fix-email-id email)
+  (string-delete #\space email))
+
+(define (investigator-attributes->id first-name last-name email)
+  ;; There is just one record corresponding to "Evan Williams" which
+  ;; does not have an email ID. To accommodate that record, we
+  ;; construct the investigator ID from not just the email ID, but
+  ;; also the first and the last names. It would be preferable to just
+  ;; find Evan Williams' email ID and insert it into the database.
+  (string->identifier "investigator"
+                      (string-join
+                       (list first-name last-name (fix-email-id email))
+                       "_")))
+
+;; Emit one foaf:Person, with name and address details, per investigator.
+(define-transformer investigators
+  ;; There are a few duplicate entries. We group by email to
+  ;; deduplicate.
+  (tables (Investigators)
+          "GROUP BY Email")
+  (triples (investigator-attributes->id (field Investigators FirstName)
+                                        (field Investigators LastName)
+                                        (field Investigators Email))
+    (set rdf:type 'foaf:Person)
+    (set foaf:name (string-append (field Investigators FirstName) " "
+                                  (field Investigators LastName)))
+    (set foaf:givenName
+         (field Investigators FirstName))
+    (set foaf:familyName
+         (field Investigators LastName))
+    (set foaf:homepage (field Investigators Url))
+    (set v:adr (field Investigators Address))
+    (set v:locality (field Investigators City))
+    (set v:region (field Investigators State))
+    (set v:postal-code (field Investigators ZipCode))
+    (set v:country-name (field Investigators Country))))
+
+;; Describe every GeneChip row (joined to its Species) as a gnc:geneChip.
+(define-transformer gene-chip
+  (tables (GeneChip
+           (left-join Species "USING (SpeciesId)")))
+  (schema-triples
+   (gnc:geneChip a skos:Concept)
+   (gnc:geneChip
+    skos:description
+    "This is a set of controlled terms that are used to describe a given gene chip/platform")
+   (gnt:hasGeoSeriesId rdfs:domain gnc:platform)
+   (gnt:belongsToSpecies a owl:ObjectProperty)
+   (gnt:belongsToSpecies skos:definition "This resource belongs to this given species")
+   (gnt:belongsToSpecies rdfs:domain gnc:geneChip)
+   (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip)
+   (gnt:hasGOTreeValue a owl:ObjectProperty)
+   (gnt:hasGOTreeValue skos:definition "This resource has the following GO tree value")
+   (gnt:hasGOTreeValue rdfs:domain gnc:geneChip))
+  (triples (string->identifier "platform" (field GeneChip Name))
+    (set rdf:type 'gnc:geneChip)
+    (set rdfs:label (field GeneChip GeneChipName))
+    (set skos:prefLabel (field GeneChip Name))
+    (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)"
+                               Title)))
+    (set gnt:hasGOTreeValue (field GeneChip Go_tree_value))
+    (set gnt:belongsToSpecies
+         (string->identifier "" (remap-species-identifiers (field Species Fullname))
+                             #:separator ""
+                             #:proc string-capitalize-first))
+    (set gnt:hasGeoSeriesId
+         (ontology
+          'geoSeries:
+          (string-trim-both (field GeneChip GeoPlatform))))))
+
+;; Datasets that have InfoFiles metadata, i.e. a GN accession id.
+(define-transformer info-files
+  (tables (InfoFiles
+           (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
+           (left-join GenoFreeze "ON InfoFiles.InfoPageName = GenoFreeze.Name")
+           (left-join ProbeSetFreeze "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
+           (left-join InbredSet "ON InfoFiles.InbredSetId = InbredSet.InbredSetId")
+           (left-join Species "ON InfoFiles.SpeciesId = Species.SpeciesId")
+           (left-join Datasets "USING (DatasetId)")
+           (left-join DatasetStatus "USING (DatasetStatusId)")
+           (left-join Tissue "USING (TissueId)")
+           (left-join Investigators "USING (InvestigatorId)")
+           (left-join AvgMethod "USING (AvgMethodId)")
+           (left-join Organizations "USING (OrganizationId)")
+           (left-join GeneChip "USING (GeneChipId)"))
+          "WHERE GN_AccesionId IS NOT NULL")
+  (schema-triples
+   (gnc:dataset rdf:type gdmt:Dataset)
+   (gnc:genotypeDataset rdfs:subClassOf gnc:dataset)
+   (gnc:phenotypeDataset rdfs:subClassOf gnc:dataset)
+   (gnc:probesetDataset rdfs:subClassOf gnc:dataset)
+   (gnt:belongsToSet rdfs:domain gnc:dataset)
+   (gnt:belongsToSet a owl:ObjectProperty)
+   (gnt:belongsToSet skos:definition "The InbredSet this resource belongs to")
+   (gnt:hasTissue rdfs:domain gnc:dataset)
+   (gnt:hasTissue a owl:ObjectProperty)
+   (gnt:hasTissue skos:definition "Tissues this resource has")
+   (gnt:hasTissueInfo rdfs:domain gnc:dataset)
+   (gnt:hasTissueInfo a owl:ObjectProperty)
+   (gnt:hasTissueInfo skos:definition "Metadata about Tissue for this resource")
+   (gnt:usesNormalization rdfs:domain gnc:dataset)
+   (gnt:usesNormalization a owl:ObjectProperty)
+   (gnt:usesNormalization skos:definition "Normalization techniques this resource has")
+   (gnt:usesPlatform rdfs:domain gnc:dataset)
+   (gnt:usesPlatform a owl:ObjectProperty)
+   (gnt:usesPlatform skos:definition "The Platform this resource uses")
+   (gnt:hasGeoSeriesId rdfs:domain gnc:dataset)
+   (gnt:hasGeoSeriesId a owl:ObjectProperty)
+   (gnt:hasGeoSeriesId skos:definition "id of record in NCBI database")
+   (gnt:hasExperimentDesignInfo rdfs:domain gnc:dataset)
+   (gnt:hasExperimentDesignInfo rdfs:label "Experiment Design")
+   (gnt:hasExperimentDesignInfo a owl:ObjectProperty)
+   (gnt:hasExperimentDesignInfo skos:definition "Information about how the experiment was designed")
+   (gnt:hasNotes rdfs:domain gnc:dataset)
+   (gnt:hasNotes a owl:ObjectProperty)
+   (gnt:hasNotes rdfs:label "Notes")
+   (gnt:hasNotes skos:definition "Extra Notes about this dataset")
+   (gnt:hasDataProcessingInfo rdfs:domain gnc:dataset)
+   (gnt:hasDataProcessingInfo rdfs:label "About Data Processing")
+   (gnt:hasDataProcessingInfo a owl:ObjectProperty)
+   (gnt:hasDataProcessingInfo skos:definition "Information about how this dataset was processed")
+   (gnt:hasPlatformInfo rdfs:domain gnc:dataset)
+   (gnt:hasPlatformInfo a owl:ObjectProperty)
+   (gnt:hasPlatformInfo rdfs:label "About Platform")
+   (gnt:hasPlatformInfo skos:definition "Information about the platform that was used with this dataset")
+   (gnt:hasCaseInfo rdfs:domain gnc:dataset)
+   (gnt:hasCaseInfo rdfs:label "About Case")
+   (gnt:hasCaseInfo a owl:ObjectProperty)
+   (gnt:hasCaseInfo skos:definition "Information about the cases used in this platform")
+   (gnt:hasAcknowledgement rdfs:domain gnc:dataset)
+   (gnt:hasAcknowledgement rdfs:label "Acknowledgement")
+   (gnt:hasAcknowledgement a owl:ObjectProperty)
+   (gnt:hasAcknowledgement skos:definition "People to acknowledge"))
+  (triples (string->identifier
+            "" (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                         (field InfoFiles InfoPageName)
+                                         'pre "_" 'post)
+            #:separator ""
+            #:proc string-capitalize-first)
+    (set rdf:type (string->symbol
+                   (field ("IF(GenoFreeze.Id IS NOT NULL, 'gnc:genotypeDataset', IF(PublishFreeze.Id IS NOT NULL, 'gnc:phenotypeDataset', IF(ProbeSetFreeze.Name IS NOT NULL, 'gnc:probesetDataset', 'gnc:dataset')))"
+                           rdfType))))
+    (set rdfs:label (regexp-substitute/global
+                     #f "^[Nn]one$"
+                     (field InfoFiles InfoPageName)
+                     ""))
+    (set skos:prefLabel
+         (field ("IFNULL(GenoFreeze.FullName, IFNULL(PublishFreeze.FullName, ''))"
+                 DatasetFullName)))
+    (set skos:prefLabel (field Datasets DatasetName DatasetGroup))
+    (set gdmt:hasTitleInfo
+         (regexp-substitute/global
+          #f "^[Nn]one$"
+          (field InfoFiles InfoFileTitle)
+          ""))
+    ;; This is the published title
+    (set dct:title
+         (regexp-substitute/global
+          #f "^[Nn]one$"
+          (field Datasets PublicationTitle)
+          ""))
+    (set dct:created
+         (field ("IFNULL(GenoFreeze.CreateTime, IFNULL(PublishFreeze.CreateTime, IFNULL(ProbeSetFreeze.CreateTime, '')))"
+                 createTimeGenoFreeze)))
+    (set gdmt:hasCreatorInfo
+         (investigator-attributes->id (field Investigators FirstName)
+                                      (field Investigators LastName)
+                                      (field Investigators Email)))
+    (set gdmt:hasCreatorAffiliation
+         (field Organizations OrganizationName))
+    (set gdmt:hasDatasetIdentifierSubType (format #f "GN~a" (field InfoFiles GN_AccesionId)))
+    (set gdmt:hasRightsInfo (string-downcase
+                             (field DatasetStatus DatasetStatusName)))
+    (set gnt:belongsToSet
+         (string->identifier
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))
+    (set gnt:hasTissue (string->identifier "tissue"
+                                           (field Tissue Short_Name)))
+    (set gnt:usesNormalization
+         (string->identifier "avgmethod"
+                             ;; If AvgMethodName is NULL, assume N/A.
+                             (if (string-blank? (field AvgMethod Name AvgMethodName))
+                                 "N/A" (field AvgMethod Name AvgMethodName))))
+    (set gnt:usesPlatform
+         (string->identifier "platform"
+                             (field GeneChip Name GeneChip)))
+    (set gdmt:isDescribedBy
+         (sanitize-rdf-string (field Datasets Summary)))
+    (set gnt:hasGeoSeriesId
+         ;; Require at least one digit after "GSE" so that a bare "GSE"
+         ;; is not turned into a geoSeries: identifier.
+         (let ((s
+                (string-match "GSE[0-9]+"
+                              (field ("IFNULL(Datasets.GeoSeries, '')" GeoSeries)))))
+           (if s (ontology
+                  'geoSeries: (match:substring s))
+               "")))
+    (set gnt:hasTissueInfo
+         (sanitize-rdf-string (field Datasets AboutTissue)))
+    (set gnt:hasContentInfo (sanitize-rdf-string (field InfoFiles Specifics)))
+    (set gnt:hasCaseInfo
+         (sanitize-rdf-string
+          (field Datasets AboutCases)))
+    (set gnt:hasPlatformInfo
+         (sanitize-rdf-string
+          (field Datasets AboutPlatform)))
+    (set gnt:hasDataProcessingInfo
+         (sanitize-rdf-string
+          (field Datasets AboutDataProcessing)))
+    (set gnt:hasNotes
+         (sanitize-rdf-string
+          (field Datasets Notes)))
+    (set gnt:hasExperimentDesignInfo
+         (sanitize-rdf-string
+          (field Datasets ExperimentDesign)))
+    (set gdmt:hasContributorInfo
+         (sanitize-rdf-string
+          (field Datasets Contributors)))
+    (set gdmt:IsCitedBy
+         (sanitize-rdf-string
+          (regexp-substitute/global
+           #f "^[Nn]one$"
+           (field Datasets Citation)
+           "")))
+    (set gnt:hasAcknowledgement
+         (sanitize-rdf-string
+          (string-trim-both
+           (regexp-substitute/global
+            #f "^[Nn]one$"
+            (field InfoFiles Data_Source_Acknowledge)
+            ""))))
+    (set gnt:hasAcknowledgement (sanitize-rdf-string
+                                 (field Datasets Acknowledgment)))))
+
+;; These are phenotype datasets that don't have Infofile metadata
+(define-transformer publishfreeze
+  (tables (PublishFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
+           (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
+          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
+  (triples
+      (string->identifier
+       ""
+       (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                 (field PublishFreeze Name)
+                                 'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:phenotypeDataset)
+    (set rdfs:label (field PublishFreeze Name))
+    (set skos:prefLabel (field PublishFreeze FullName))
+    (set skos:altLabel (field PublishFreeze ShortName))
+    (set dct:created (annotate-field
+                      (field PublishFreeze CreateTime)
+                      '^^xsd:date))
+    (set gnt:belongsToSet
+         (string->identifier
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
+
+;; Genotype datasets that don't have InfoFiles metadata.
+(define-transformer genofreeze
+  (tables (GenoFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
+           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
+          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
+  (triples
+      (string->identifier
+       ""
+       ;; One pass replaces every character outside [A-Za-z0-9:] with
+       ;; "_"; running it again would only rewrite "_" to "_".
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field GenoFreeze Name)
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:genotypeDataset)
+    (set rdfs:label (field GenoFreeze Name))
+    (set skos:prefLabel (field GenoFreeze FullName))
+    (set skos:altLabel (field GenoFreeze ShortName))
+    (set dct:created (annotate-field
+                      (field GenoFreeze CreateTime)
+                      '^^xsd:date))
+    (set gnt:belongsToSet
+         (string->identifier
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
+
+;; Molecular Traits are also referred to as ProbeSets
+(define-transformer probesetfreeze
+  (tables (ProbeSetFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
+           (left-join ProbeFreeze "USING (ProbeFreezeId)")
+           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
+           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
+           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
+          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
+  (schema-triples
+   (gnt:usesNormalization rdfs:domain gnc:probeset)
+   (gnt:usesDataScale rdfs:domain gnc:probeset)
+   (gnt:usesDataScale a owl:ObjectProperty)
+   (gnt:usesDataScale skos:definition "The data scale this resource uses"))
+  (triples
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field ProbeSetFreeze Name)
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:probesetDataset)
+    (set gnt:usesNormalization
+         (string->identifier "avgmethod"
+                             ;; If AvgMethodName is NULL, assume N/A.
+                             (if (string-blank? (field AvgMethod Name AvgMethodName))
+                                 "N/A" (field AvgMethod Name AvgMethodName))))
+    (set dct:title (field ProbeSetFreeze FullName))
+    (set rdfs:label (field ProbeSetFreeze ShortName))
+    (set skos:prefLabel (field ProbeSetFreeze Name))
+    (set skos:altLabel (field ProbeSetFreeze Name2))
+    ;; XSD datatype names are case-sensitive: the type is xsd:dateTime.
+    ;; NOTE(review): the publishfreeze and genofreeze transformers tag
+    ;; CreateTime as xsd:date; confirm which one matches this column.
+    (set dct:created (annotate-field
+                      (field ProbeSetFreeze CreateTime)
+                      '^^xsd:dateTime))
+    (set gnt:usesDataScale (field ProbeSetFreeze DataScale))
+    (set gnt:hasTissue
+         (string->identifier
+          "tissue"
+          (field Tissue Short_Name)))
+    (set gnt:belongsToSet
+         (string->identifier
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
+
+
+
+;; Wire the transformers above to the documentation and RDF outputs.
+(with-documentation
+ (name "Info files / Investigators Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ ;; NOTE(review): every namespace IRI below is the empty string, which
+ ;; looks like the "<...>" values were lost; restore them before use.
+ ;; xsd: (used by the ^^xsd: annotations above) is not declared here
+ ;; either -- confirm whether the framework adds it.
+ (prefixes
+  '(("v:" "")
+    ("foaf:" "")
+    ("gdmt:" "")
+    ("skos:" "")
+    ("geoSeries:" "")
+    ("gnt:" "")
+    ("gn:" "")
+    ("gnc:" "")
+    ("rdf:" "")
+    ("owl:" "")
+    ("rdfs:" "")
+    ("taxon:" "")
+    ("dct:" "")))
+ (inputs
+  (list info-files
+        publishfreeze
+        genofreeze
+        probesetfreeze
+        investigators
+        gene-chip))
+ (outputs
+  '(#:documentation "./docs/info-pages.md"
+    #:rdf "/export/data/genenetwork-virtuoso/info-pages.ttl")))
+
-- 
cgit v1.2.3