From 51b3c0548c98e0bc05e11a89cbf6b75d31b9f8d5 Mon Sep 17 00:00:00 2001
From: Munyoki Kilyungi
Date: Mon, 21 Aug 2023 14:54:21 +0300
Subject: Remove "dump-" prefix

Signed-off-by: Munyoki Kilyungi
---
 examples/species-metadata.scm | 226 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100755 examples/species-metadata.scm

(limited to 'examples/species-metadata.scm')

diff --git a/examples/species-metadata.scm b/examples/species-metadata.scm
new file mode 100755
index 0000000..f3794b8
--- /dev/null
+++ b/examples/species-metadata.scm
@@ -0,0 +1,226 @@
+#! /usr/bin/env guile
+!#
+
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 regex)
+             (dump strings)
+             (dump sql)
+             (dump triples)
+             (dump special-forms))
+
+;; Database connection settings: a single Scheme datum read from the file
+;; named by the first command-line argument.
+(define %connection-settings
+  (call-with-input-file (list-ref (command-line) 1)
+    read))
+
+(define (remap-species-identifiers str)
+  "Return the standard binomial name for the species name STR, or STR itself when
+it has no entry below.  This should really be sorted by correcting the database!"
+  (match str
+    ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"]
+    ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"]
+    ["Macaca mulatta" "Macaca nemestrina"] ; NOTE(review): maps one binomial onto a different species name -- confirm intent
+    ["Bat (Glossophaga soricina)" "Glossophaga soricina"]
+    [_ str]))
+
+;; Transformer for the Species table.  The identifier is built from the
+;; remapped full name; each `set' form pairs a predicate with a column.
+(define-transformer species
+  (tables (Species))
+  (schema-triples
+   (gnc:species a skos:Concept)
+   (gnc:species skos:definition "This is a set of controlled terms that are used to describe a given species")
+   (gnc:species skos:broader gnc:family)
+   (gnt:binomialName a owl:ObjectProperty)
+   (gnt:binomialName rdfs:domain gnc:species)
+   (gnt:family a owl:ObjectProperty)
+   (gnt:family rdfs:domain gnc:species)
+   (gnt:family skos:definition "This resource belongs to this family")
+   (gnt:organism a owl:ObjectProperty)
+   (gnt:organism rdfs:domain gnc:species)
+   (gnt:shortName a owl:ObjectProperty)
+   (gnt:shortName rdfs:domain gnc:species))
+  (triples
+      (string->identifier "" (remap-species-identifiers (field Species FullName))
+                          #:separator ""
+                          #:proc string-capitalize-first)
+    (set rdf:type 'gnc:species)
+    (set skos:label (field Species SpeciesName)) ; NOTE(review): SKOS has prefLabel/altLabel/hiddenLabel but no skos:label -- confirm
+    (set skos:altLabel (field Species Name))
+    (set rdfs:label (field Species MenuName))
+    (set gnt:binomialName (field Species FullName))
+    (set gnt:family (field Species Family))
+    (set gnt:organism (ontology 'taxon: (field Species TaxonomyId)))))
+
+#!
+
+The ProbeData table contains StrainID.
+
+MariaDB [db_webqtl]> select * from ProbeData limit 2;
++--------+----------+---------+
+| Id     | StrainId | value   |
++--------+----------+---------+
+| 503636 |       42 | 11.6906 |
+| 503636 |       43 | 11.4205 |
++--------+----------+---------+
+
+Likewise
+
+MariaDB [db_webqtl]> select * from ProbeSetData limit 2;
++----+----------+-------+
+| Id | StrainId | value |
++----+----------+-------+
+|  1 |        1 | 5.742 |
+|  1 |        2 | 5.006 |
++----+----------+-------+
+
+To get at the strain use
+
+MariaDB [db_webqtl]> select * from Strain where Id=1 limit 15;
++----+--------+--------+-----------+--------+-------+
+| Id | Name   | Name2  | SpeciesId | Symbol | Alias |
++----+--------+--------+-----------+--------+-------+
+|  1 | B6D2F1 | B6D2F1 |         1 | NULL   | NULL  |
++----+--------+--------+-----------+--------+-------+
+
+A typical query may look like
+
+SELECT Strain.Name, Strain.Id FROM Strain, Species
+WHERE Strain.Name IN f{create_in_clause(self.samplelist)}
+AND Strain.SpeciesId=Species.Id
+AND Species.name = %s, (self.group.species,)
+
+At this point it is not very clear how Name, Name2, Symbol and Alias are used.
+
+!#
+
+(define-transformer strain
+  (tables (Strain
+           (left-join Species "ON Strain.SpeciesId = Species.SpeciesId"))) ; NOTE(review): inbred-set joins on Species.Id -- confirm which key is intended
+  (schema-triples
+   (gnc:strain skos:broader gnc:species)
+   (gnt:belongsToSpecies rdfs:domain gnc:strain)
+   (gnt:belongsToSpecies a owl:ObjectProperty)
+   (gnt:belongsToSpecies skos:definition "This resource belongs to this species")
+   (gnt:alias rdfs:domain gnc:strain)
+   (gnt:alias a owl:ObjectProperty)
+   (gnt:symbol rdfs:domain gnc:strain)
+   (gnt:symbol a owl:ObjectProperty))
+  (triples (string->identifier
+            ""
+            (regexp-substitute/global
+             #f "[^A-Za-z0-9:]" ; every character outside this set becomes "_"
+             (field Strain Name)
+             'pre "_" 'post)
+            #:separator ""
+            #:proc string-capitalize-first)
+    (set rdf:type 'gnc:strain)
+    (set gnt:belongsToSpecies
+         (string->identifier "" (remap-species-identifiers (field Species FullName))
+                             #:separator ""
+                             #:proc string-capitalize-first))
+    ;; Name, plus Name2 when it differs from Name (otherwise the empty string)
+    (set rdfs:label (sanitize-rdf-string (field Strain Name)))
+    (set rdfs:label (sanitize-rdf-string (field ("IF ((Strain.Name2 != Strain.Name), Strain.Name2, '')" Name2))))
+    (set gnt:alias (sanitize-rdf-string (field ("IF ((Strain.Alias != Strain.Name), Strain.Alias, '')" Alias))))
+    (set gnt:symbol (field ("IF ((Strain.Symbol != Strain.Name), Strain.Symbol, '')" Symbol)))))
+
+(define-transformer mapping-method
+  (tables (MappingMethod))
+  (schema-triples
+   (gnc:mappingMethod a skos:Concept)
+   (gnc:mappingMethod skos:definition "Terms that decribe mapping/normalization methods used in GeneNetwork"))
+  (triples
+      (string->identifier "mappingMethod" (field MappingMethod Name))
+    (set rdf:type 'gnc:mappingMethod)
+    (set rdfs:label (field MappingMethod Name))))
+
+;; Transformer for the InbredSet table, joined to its Species and MappingMethod rows.
+(define-transformer inbred-set
+  (tables (InbredSet
+           (left-join Species "ON InbredSet.SpeciesId=Species.Id")
+           (left-join MappingMethod
+                      "ON InbredSet.MappingMethodId=MappingMethod.Id")))
+  (schema-triples
+   (gnc:set skos:broader gnc:species)
+   (gnc:set skos:definition "A set of terms used to describe an set, which can be inbredSet, outbredSet etc etc.")
+   (gnt:geneticType a owl:ObjectProperty)
+   (gnt:geneticType rdfs:domain gnc:set)
+   (gnt:code a owl:ObjectProperty)
+   (gnt:code rdfs:domain gnc:set)
+   ;; Already defined as an owl prop in species
+   (gnt:family rdfs:domain gnc:set)
+   (gnt:phenotype a owl:ObjectProperty)
+   (gnt:phenotype rdfs:domain gnc:set)
+   (gnt:genotype a owl:ObjectProperty)
+   (gnt:genotype rdfs:domain gnc:set)
+   (gnt:mappingMethod a owl:ObjectProperty)
+   (gnt:mappingMethod rdfs:domain gnc:set))
+  (triples (string->identifier
+            "set" (field InbredSet Name)
+            #:separator ""
+            #:proc string-capitalize-first)
+    (set rdf:type 'gnc:set)
+    (set rdfs:label (field InbredSet FullName))
+    (set skos:altLabel (field InbredSet Name))
+    (set gnt:geneticType (field InbredSet GeneticType))
+    (set gnt:family (field InbredSet Family))
+    (set gnt:mappingMethod (field MappingMethod Name))
+    (set gnt:code (field InbredSet InbredSetCode))
+    (set gnt:belongsToSpecies
+         (string->identifier "" (remap-species-identifiers (field Species FullName))
+                             #:separator ""
+                             #:proc string-capitalize-first))
+    (set gnt:genotype ; non-empty only when the set has a GenoFreeze row
+         (field ("IF ((SELECT GenoFreeze.Name FROM GenoFreeze WHERE GenoFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'DNA Markers and SNPs', '')" genotypeP)))
+    (set gnt:phenotype ; non-empty only when the set has a PublishFreeze row
+         (field ("IF ((SELECT PublishFreeze.Name FROM PublishFreeze WHERE PublishFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'Traits and Cofactors', '')" phenotypeP)))
+    ;; NOTE(review): the subquery lists InbredSet in its own FROM, so it looks uncorrelated with the outer row -- confirm.
+    (multiset gnt:hasTissue
+              (map
+               (lambda (x)
+                 (string->identifier "tissue"
+                                     x))
+               (string-split-substring
+                (field ("(SELECT GROUP_CONCAT(DISTINCT Tissue.Short_Name SEPARATOR'||') AS MolecularTraits FROM ProbeFreeze, ProbeSetFreeze, InbredSet, Tissue, Species WHERE ProbeFreeze.TissueId = Tissue.Id AND ProbeFreeze.InbredSetId = InbredSet.Id AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id ORDER BY Tissue.Name)"
+                        molecularTrait))
+                "||")))))
+
+(define-transformer avg-method
+  ;; The Name and Normalization fields seem to be the same.  The identifier
+  ;; is built from Name; the label is taken from Normalization.
+  (tables (AvgMethod))
+  (schema-triples
+   (gnc:avgMethod rdf:type owl:Class))
+  (triples (string->identifier "avgmethod" (field AvgMethod Name))
+    (set rdf:type 'gnc:avgMethod)
+    (set rdfs:label (field AvgMethod Normalization))))
+
+;; Top-level form tying everything together: connection settings, RDF prefixes,
+;; the transformers above as inputs, and the Markdown/Turtle output paths.
+(with-documentation
+ (name "Species Metadata")
+ (connection %connection-settings)
+ (table-metadata? #f)
+ (prefixes
+  '(("gn:" "<http://genenetwork.org/id/>")
+    ("gnc:" "<http://genenetwork.org/category/>")
+    ("owl:" "<http://www.w3.org/2002/07/owl#>")
+    ("gnt:" "<http://genenetwork.org/term/>")
+    ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
+    ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+    ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+    ("taxon:" "<http://purl.uniprot.org/taxonomy/>")))
+ (inputs
+  (list
+   inbred-set
+   species
+   strain
+   mapping-method
+   avg-method))
+ (outputs
+  '(#:documentation "./docs/species-metadata.md"
+    #:rdf "/export/data/genenetwork-virtuoso/species-metadata.ttl")))
-- 
cgit v1.2.3