From add95e737f61fdf3e8f244dd7ebedca963514bb7 Mon Sep 17 00:00:00 2001
From: Munyoki Kilyungi
Date: Mon, 21 Aug 2023 14:41:23 +0300
Subject: Move dumps related to datasets to one place

* examples/dump-dataset-metadata.scm: Add dump-gene-chip,
dump-publishfreeze, dump-genofreeze, dump-probesetfreeze
* examples/dump-genotype.scm: Delete dump-genofreeze.
* examples/dump-phenotype.scm: Delete dump-publishfreeze.
* examples/dump-probesetfreeze.scm: Delete file
---
 examples/dump-dataset-metadata.scm | 149 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 148 insertions(+), 1 deletion(-)

(limited to 'examples/dump-dataset-metadata.scm')

diff --git a/examples/dump-dataset-metadata.scm b/examples/dump-dataset-metadata.scm
index e732772..8f381b7 100755
--- a/examples/dump-dataset-metadata.scm
+++ b/examples/dump-dataset-metadata.scm
@@ -56,6 +56,43 @@
     (set v:postal-code (field Investigators ZipCode))
     (set v:country-name (field Investigators Country))))
 
+;; Dump the GeneChip table (left-joined to Species) as gnc:geneChip
+;; resources identified by the chip name.
+(define-dump dump-gene-chip
+  (tables (GeneChip
+           (left-join Species "USING (SpeciesId)")))
+  (schema-triples
+   (gnc:geneChip a skos:Concept)
+   (gnc:geneChip
+    skos:description
+    "This is a set of controlled terms that are used to describe a given gene chip/platform")
+   ;; NOTE(review): gnt:hasGeoSeriesId gets two rdfs:domain statements
+   ;; (gnc:platform here, gnc:geneChip below) -- confirm both are intended.
+   (gnt:hasGeoSeriesId rdfs:domain gnc:platform)
+   (gnt:belongsToSpecies a owl:ObjectProperty)
+   (gnt:belongsToSpecies skos:definition "This resource belongs to this given species")
+   (gnt:belongsToSpecies rdfs:domain gnc:geneChip)
+   (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip)
+   (gnt:hasGOTreeValue a owl:ObjectProperty)
+   (gnt:hasGOTreeValue skos:definition "This resource has the following GO tree value")
+   (gnt:hasGOTreeValue rdfs:domain gnc:geneChip))
+  (triples (string->identifier "platform" (field GeneChip Name))
+    (set rdf:type 'gnc:geneChip)
+    (set rdfs:label (field GeneChip GeneChipName))
+    (set skos:prefLabel (field GeneChip Name))
+    ;; The SQL expression yields Title only when it differs from
+    ;; GeneChipName, and NULL otherwise.
+    (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)"
+                              Title)))
+    (set gnt:hasGOTreeValue (field GeneChip Go_tree_value))
+    (set gnt:belongsToSpecies
+         (string->identifier "" (remap-species-identifiers (field Species Fullname))
+                             #:separator ""
+                             #:proc string-capitalize-first))
+    (set gnt:hasGeoSeriesId
+         (ontology 'geoSeries:
+                   (string-trim-both (field GeneChip GeoPlatform))))))
+
 (define-dump dump-info-files
   (tables (InfoFiles
            (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name")
@@ -219,6 +256,112 @@
     (set gnt:hasAcknowledgement
          (sanitize-rdf-string (field Datasets Acknowledgment)))))
 
+;; These are phenotype datasets that don't have Infofile metadata:
+;; the InfoFiles left join is filtered down to rows with no match.
+(define-dump dump-publishfreeze
+  (tables (PublishFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
+           (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
+          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
+  (triples
+      (string->identifier
+       ""
+       (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                 (field PublishFreeze Name)
+                                 'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:phenotypeDataset)
+    (set rdfs:label (field PublishFreeze Name))
+    (set skos:prefLabel (field PublishFreeze FullName))
+    (set skos:altLabel (field PublishFreeze ShortName))
+    (set dct:created (annotate-field
+                      (field PublishFreeze CreateTime)
+                      '^^xsd:date))
+    (set gnt:belongsToSet
+         (string->identifier
+          "inbredSet" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
+
+;; Genotype datasets that don't have Infofile metadata: the InfoFiles
+;; left join is filtered down to rows with no match.
+(define-dump dump-genofreeze
+  (tables (GenoFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name")
+           (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId"))
+          "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
+  (triples
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field GenoFreeze Name)
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:genotypeDataset)
+    (set rdfs:label (field GenoFreeze Name))
+    (set skos:prefLabel (field GenoFreeze FullName))
+    (set skos:altLabel (field GenoFreeze ShortName))
+    (set dct:created (annotate-field
+                      (field GenoFreeze CreateTime)
+                      '^^xsd:date))
+    (set gnt:belongsToSet
+         (string->identifier
+          "inbredSet" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
+
+;; Molecular Traits are also referred to as ProbeSets.
+;; Only datasets without Infofile metadata are selected here.
+(define-dump dump-probesetfreeze
+  (tables (ProbeSetFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name")
+           (left-join ProbeFreeze "USING (ProbeFreezeId)")
+           (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID")
+           (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id")
+           (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId"))
+          "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id")
+  (schema-triples
+   (gnt:usesNormalization rdfs:domain gnc:probeset)
+   (gnt:usesDataScale rdfs:domain gnc:probeset)
+   (gnt:usesDataScale a owl:ObjectProperty)
+   (gnt:usesDataScale skos:definition "The data scale this resource uses"))
+  (triples
+      (string->identifier
+       ""
+       (regexp-substitute/global
+        #f "[^A-Za-z0-9:]"
+        (field ProbeSetFreeze Name)
+        'pre "_" 'post)
+       #:separator ""
+       #:proc string-capitalize-first)
+    (set rdf:type 'gnc:probesetDataset)
+    (set gnt:usesNormalization
+         (string->identifier "avgmethod"
+                             ;; If AvgMethodName is NULL, assume N/A.
+                             (if (string-blank? (field AvgMethod Name AvgMethodName))
+                                 "N/A" (field AvgMethod Name AvgMethodName))))
+    (set dct:title (field ProbeSetFreeze FullName))
+    (set rdfs:label (field ProbeSetFreeze ShortName))
+    (set skos:prefLabel (field ProbeSetFreeze Name))
+    (set skos:altLabel (field ProbeSetFreeze Name2))
+    ;; NOTE(review): XSD spells this datatype xsd:dateTime, and the other
+    ;; dataset dumps annotate CreateTime as xsd:date -- confirm the intent.
+    (set dct:created (annotate-field
+                      (field ProbeSetFreeze CreateTime)
+                      '^^xsd:datetime))
+    (set gnt:usesDataScale (field ProbeSetFreeze DataScale))
+    (set gnt:hasTissue
+         (string->identifier
+          "tissue"
+          (field Tissue Short_Name)))
+    (set gnt:belongsToSet
+         (string->identifier
+          "inbredSet" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
 
 
 
@@ -242,7 +385,11 @@
     ("dct:" "")))
  (inputs
   (list dump-info-files
-       dump-investigators))
+       dump-publishfreeze
+       dump-genofreeze
+       dump-probesetfreeze
+       dump-investigators
+       dump-gene-chip))
  (outputs
   '(#:documentation "./docs/dump-info-pages.md"
     #:rdf "/export/data/genenetwork-virtuoso/dump-info-pages.ttl")))
-- 
cgit v1.2.3