diff options
author | Munyoki Kilyungi | 2023-08-21 14:41:23 +0300 |
---|---|---|
committer | Munyoki Kilyungi | 2023-08-21 14:41:23 +0300 |
commit | add95e737f61fdf3e8f244dd7ebedca963514bb7 (patch) | |
tree | 667b16a1a9081b4fc445b4c6cf1c2caa76a4f1ab /examples | |
parent | e25db8edf8615f59d0682841fde8d43367ebfa53 (diff) | |
download | gn-transform-databases-add95e737f61fdf3e8f244dd7ebedca963514bb7.tar.gz |
Move dumps related to datasets to one place
* examples/dump-dataset-metadata.scm: Add dump-gene-chip,
dump-publishfreeze, dump-genofreeze, dump-probesetfreeze.
* examples/dump-genotype.scm: Delete dump-genofreeze.
* examples/dump-phenotype.scm: Delete dump-publishfreeze.
* examples/dump-probesetfreeze.scm: Delete file.
Diffstat (limited to 'examples')
-rwxr-xr-x | examples/dump-dataset-metadata.scm | 140 | ||||
-rwxr-xr-x | examples/dump-genotype.scm | 34 | ||||
-rwxr-xr-x | examples/dump-phenotype.scm | 29 | ||||
-rwxr-xr-x | examples/dump-probesetfreeze.scm | 131 |
4 files changed, 140 insertions, 194 deletions
diff --git a/examples/dump-dataset-metadata.scm b/examples/dump-dataset-metadata.scm index e732772..8f381b7 100755 --- a/examples/dump-dataset-metadata.scm +++ b/examples/dump-dataset-metadata.scm @@ -56,6 +56,37 @@ (set v:postal-code (field Investigators ZipCode)) (set v:country-name (field Investigators Country)))) +(define-dump dump-gene-chip + (tables (GeneChip + (left-join Species "USING (SpeciesId)"))) + (schema-triples + (gnc:geneChip a skos:Concept) + (gnc:geneChip + skos:description + "This is a set of controlled terms that are used to describe a given gene chip/platform") + (gnt:hasGeoSeriesId rdfs:domain gnc:platform) + (gnt:belongsToSpecies a owl:ObjectProperty) + (gnt:belongsToSpecies skos:definition "This resource belongs to this given species") + (gnt:belongsToSpecies rdfs:domain gnc:geneChip) + (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip) + (gnt:hasGOTreeValue a owl:ObjectProperty) + (gnt:hasGOTreeValue skos:definition "This resource the following GO tree value") + (gnt:hasGOTreeValue rdfs:domain gnc:geneChip)) + (triples (string->identifier "platform" (field GeneChip Name)) + (set rdf:type 'gnc:geneChip) + (set rdfs:label (field GeneChip GeneChipName)) + (set skos:prefLabel (field GeneChip Name)) + (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" + Title))) + (set gnt:hasGOTreeValue (field GeneChip Go_tree_value)) + (set gnt:belongsToSpecies + (string->identifier "" (remap-species-identifiers (field Species Fullname)) + #:separator "" + #:proc string-capitalize-first)) + (set gnt:hasGeoSeriesId + (ontology 'geoSeries: + (string-trim-both (field GeneChip GeoPlatform)))))) + (define-dump dump-info-files (tables (InfoFiles (left-join PublishFreeze "ON InfoFiles.InfoPageName = PublishFreeze.Name") @@ -219,6 +250,109 @@ (set gnt:hasAcknowledgement (sanitize-rdf-string (field Datasets Acknowledgment))))) +;; These are phenotype datasets that don't have Infofile metadata +(define-dump dump-publishfreeze + (tables 
(PublishFreeze + (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name") + (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId")) + "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL") + (triples + (string->identifier + "" + (regexp-substitute/global #f "[^A-Za-z0-9:]" + (field PublishFreeze Name) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) + (set rdf:type 'gnc:phenotypeDataset) + (set rdfs:label (field PublishFreeze Name)) + (set skos:prefLabel (field PublishFreeze FullName)) + (set skos:altLabel (field PublishFreeze ShortName)) + (set dct:created (annotate-field + (field PublishFreeze CreateTime) + '^^xsd:date)) + (set gnt:belongsToSet + (string->identifier + "inbredSet" (field InbredSet Name) + #:separator "" + #:proc string-capitalize-first)))) + +(define-dump dump-genofreeze + (tables (GenoFreeze + (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name") + (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId")) + "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") + (triples + (string->identifier + "" + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field GenoFreeze Name) + 'pre "_" 'post) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) + (set rdf:type 'gnc:genotypeDataset) + (set rdfs:label (field GenoFreeze Name)) + (set skos:prefLabel (field GenoFreeze FullName)) + (set skos:altLabel (field GenoFreeze ShortName)) + (set dct:created (annotate-field + (field GenoFreeze CreateTime) + '^^xsd:date)) + (set gnt:belongsToSet + (string->identifier + "inbredSet" (field InbredSet Name) + #:separator "" + #:proc string-capitalize-first)))) + +;; Molecular Traits are also referred to as ProbeSets +(define-dump dump-probesetfreeze + (tables (ProbeSetFreeze + (left-join InfoFiles "ON InfoFiles.InfoPageName = 
ProbeSetFreeze.Name") + (left-join ProbeFreeze "USING (ProbeFreezeId)") + (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID") + (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id") + (left-join Tissue "ON ProbeFreeze.TissueId = Tissue.TissueId")) + "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id") + (schema-triples + (gnt:usesNormalization rdfs:domain gnc:probeset) + (gnt:usesDataScale rdfs:domain gnc:probeset) + (gnt:usesDataScale a owl:ObjectProperty) + (gnt:usesDataScale skos:definition "Thi data scale this resource uses")) + (triples + (string->identifier + "" + (regexp-substitute/global + #f "[^A-Za-z0-9:]" + (field ProbeSetFreeze Name) + 'pre "_" 'post) + #:separator "" + #:proc string-capitalize-first) + (set rdf:type 'gnc:probesetDataset) + (set gnt:usesNormalization + (string->identifier "avgmethod" + ;; If AvgMethodName is NULL, assume N/A. + (if (string-blank? (field AvgMethod Name AvgMethodName)) + "N/A" (field AvgMethod Name AvgMethodName)))) + (set dct:title (field ProbeSetFreeze FullName)) + (set rdfs:label (field ProbeSetFreeze ShortName)) + (set skos:prefLabel (field ProbeSetFreeze Name)) + (set skos:altLabel (field ProbeSetFreeze Name2)) + (set dct:created (annotate-field + (field ProbeSetFreeze CreateTime) + '^^xsd:datetime)) + (set gnt:usesDataScale (field ProbeSetFreeze DataScale)) + (set gnt:hasTissue + (string->identifier + "tissue" + (field Tissue Short_Name))) + (set gnt:belongsToSet + (string->identifier + "inbredSet" (field InbredSet Name) + #:separator "" + #:proc string-capitalize-first)))) @@ -242,7 +376,11 @@ ("dct:" "<http://purl.org/dc/terms/>"))) (inputs (list dump-info-files - dump-investigators)) + dump-publishfreeze + dump-genofreeze + dump-probesetfreeze + dump-investigators + dump-gene-chip)) (outputs '(#:documentation "./docs/dump-info-pages.md" #:rdf "/export/data/genenetwork-virtuoso/dump-info-pages.ttl"))) diff --git a/examples/dump-genotype.scm 
b/examples/dump-genotype.scm index 04f1af0..30e7796 100755 --- a/examples/dump-genotype.scm +++ b/examples/dump-genotype.scm @@ -30,37 +30,6 @@ ["Bat (Glossophaga soricina)" "Glossophaga soricina"] [str str])) -(define-dump dump-genofreeze - (tables (GenoFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name") - (left-join InbredSet "ON GenoFreeze.InbredSetId = InbredSet.InbredSetId")) - "WHERE GenoFreeze.public > 0 AND GenoFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL") - (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field GenoFreeze Name) - 'pre "_" 'post) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:genotypeDataset) - (set rdfs:label (field GenoFreeze Name)) - (set skos:prefLabel (field GenoFreeze FullName)) - (set skos:altLabel (field GenoFreeze ShortName)) - (set dct:created (annotate-field - (field GenoFreeze CreateTime) - '^^xsd:date)) - (set gnt:belongsToSet - (string->identifier - "inbredSet" (field InbredSet Name) - #:separator "" - #:proc string-capitalize-first)))) - - (define-dump dump-genotypes (tables (Geno (left-join Species "USING (SpeciesId)"))) @@ -149,8 +118,7 @@ ("skos:" "<http://www.w3.org/2004/02/skos/core#>") ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) (inputs - (list dump-genofreeze - dump-genotypes)) + (list dump-genotypes)) (outputs '(#:documentation "./docs/dump-genotype.md" #:rdf "/export/data/genenetwork-virtuoso/dump-genotype.ttl"))) diff --git a/examples/dump-phenotype.scm b/examples/dump-phenotype.scm index f5a8c3c..6c52ebb 100755 --- a/examples/dump-phenotype.scm +++ b/examples/dump-phenotype.scm @@ -19,34 +19,6 @@ read)) - -;; These are phenotype datasets that don't have Infofile metadata -(define-dump dump-publishfreeze - (tables (PublishFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name") - (left-join InbredSet "ON 
PublishFreeze.InbredSetId = InbredSet.InbredSetId")) - "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL") - (triples - (string->identifier - "" - (regexp-substitute/global #f "[^A-Za-z0-9:]" - (field PublishFreeze Name) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:phenotypeDataset) - (set rdfs:label (field PublishFreeze Name)) - (set skos:prefLabel (field PublishFreeze FullName)) - (set skos:altLabel (field PublishFreeze ShortName)) - (set dct:created (annotate-field - (field PublishFreeze CreateTime) - '^^xsd:date)) - (set gnt:belongsToSet - (string->identifier - "inbredSet" (field InbredSet Name) - #:separator "" - #:proc string-capitalize-first)))) - (define-dump dump-phenotypes (tables (PublishXRef (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId") @@ -147,7 +119,6 @@ ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>"))) (inputs (list - dump-publishfreeze dump-phenotypes)) (outputs '(#:documentation "./docs/dump-phenotype.md" diff --git a/examples/dump-probesetfreeze.scm b/examples/dump-probesetfreeze.scm deleted file mode 100755 index 50307bf..0000000 --- a/examples/dump-probesetfreeze.scm +++ /dev/null @@ -1,131 +0,0 @@ -#! /usr/bin/env guile -!# - -(use-modules (srfi srfi-1) - (srfi srfi-26) - (ice-9 match) - (ice-9 regex) - (dump strings) - (dump sql) - (dump triples) - (dump special-forms)) - - - -(define %connection-settings - (call-with-input-file (list-ref (command-line) 1) - read)) - - -(define (remap-species-identifiers str) - "This procedure remaps identifiers to standard binominal. Obviously this should - be sorted by correcting the database!" 
- (match str - ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] - ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] - ["Macaca mulatta" "Macaca nemestrina"] - ["Bat (Glossophaga soricina)" "Glossophaga soricina"] - [str str])) - - -(define-dump dump-gene-chip - (tables (GeneChip - (left-join Species "USING (SpeciesId)"))) - (schema-triples - (gnc:geneChip a skos:Concept) - (gnc:geneChip - skos:description - "This is a set of controlled terms that are used to describe a given gene chip/platform") - (gnt:hasGeoSeriesId rdfs:domain gnc:platform) - (gnt:belongsToSpecies a owl:ObjectProperty) - (gnt:belongsToSpecies skos:definition "This resource belongs to this given species") - (gnt:belongsToSpecies rdfs:domain gnc:geneChip) - (gnt:hasGeoSeriesId rdfs:domain gnc:geneChip) - (gnt:hasGOTreeValue a owl:ObjectProperty) - (gnt:hasGOTreeValue skos:definition "This resource the following GO tree value") - (gnt:hasGOTreeValue rdfs:domain gnc:geneChip)) - (triples (string->identifier "platform" (field GeneChip Name)) - (set rdf:type 'gnc:geneChip) - (set rdfs:label (field GeneChip GeneChipName)) - (set skos:prefLabel (field GeneChip Name)) - (set skos:altLabel (field ("IF(GeneChip.GeneChipName != GeneChip.Title, Title, NULL)" - Title))) - (set gnt:hasGOTreeValue (field GeneChip Go_tree_value)) - (set gnt:belongsToSpecies - (string->identifier "" (remap-species-identifiers (field Species Fullname)) - #:separator "" - #:proc string-capitalize-first)) - (set gnt:hasGeoSeriesId - (ontology 'geoSeries: - (string-trim-both (field GeneChip GeoPlatform)))))) - -;; Molecular Traits are also referred to as ProbeSets -(define-dump dump-probesetfreeze - (tables (ProbeSetFreeze - (left-join InfoFiles "ON InfoFiles.InfoPageName = ProbeSetFreeze.Name") - (left-join ProbeFreeze "USING (ProbeFreezeId)") - (left-join AvgMethod "ON AvgMethod.AvgMethodId = ProbeSetFreeze.AvgID") - (left-join InbredSet "ON ProbeFreeze.InbredSetId = InbredSet.Id") - (left-join Tissue "ON 
ProbeFreeze.TissueId = Tissue.TissueId")) - "WHERE ProbeSetFreeze.public > 0 AND InfoFiles.InfoPageName IS NULL GROUP BY ProbeFreeze.Id") - (schema-triples - (gnt:usesNormalization rdfs:domain gnc:probeset) - (gnt:usesDataScale rdfs:domain gnc:probeset) - (gnt:usesDataScale a owl:ObjectProperty) - (gnt:usesDataScale skos:definition "Thi data scale this resource uses")) - (triples - (string->identifier - "" - (regexp-substitute/global - #f "[^A-Za-z0-9:]" - (field ProbeSetFreeze Name) - 'pre "_" 'post) - #:separator "" - #:proc string-capitalize-first) - (set rdf:type 'gnc:probesetDataset) - (set gnt:usesNormalization - (string->identifier "avgmethod" - ;; If AvgMethodName is NULL, assume N/A. - (if (string-blank? (field AvgMethod Name AvgMethodName)) - "N/A" (field AvgMethod Name AvgMethodName)))) - (set dct:title (field ProbeSetFreeze FullName)) - (set rdfs:label (field ProbeSetFreeze ShortName)) - (set skos:prefLabel (field ProbeSetFreeze Name)) - (set skos:altLabel (field ProbeSetFreeze Name2)) - (set dct:created (annotate-field - (field ProbeSetFreeze CreateTime) - '^^xsd:datetime)) - (set gnt:usesDataScale (field ProbeSetFreeze DataScale)) - (set gnt:hasTissue - (string->identifier - "tissue" - (field Tissue Short_Name))) - (set gnt:belongsToSet - (string->identifier - "inbredSet" (field InbredSet Name) - #:separator "" - #:proc string-capitalize-first)))) - - - -(dump-with-documentation - (name "Probeset freeze metadata") - (connection %connection-settings) - (table-metadata? 
#f) - (prefixes - '(("geoSeries:" "<http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=>") - ("gn:" "<http://genenetwork.org/id/>") - ("gnc:" "<http://genenetwork.org/category/>") - ("dct:" "<http://purl.org/dc/terms/>") - ("owl:" "<http://www.w3.org/2002/07/owl#>") - ("skos:" "<http://www.w3.org/2004/02/skos/core#>") - ("gnt:" "<http://genenetwork.org/term/>") - ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>") - ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>") - ("xsd:" "<http://www.w3.org/2001/XMLSchema#>"))) - (inputs - (list dump-gene-chip - dump-probesetfreeze)) - (outputs - '(#:documentation "./docs/dump-gene-chip.md" - #:rdf "./verified-data/dump-probesetfreeze.ttl"))) |