about summary refs log tree commit diff
diff options
context:
space:
mode:
author Munyoki Kilyungi 2023-08-25 18:07:34 +0300
committer Munyoki Kilyungi 2023-08-28 12:38:35 +0300
commit c9a84ecb21c7fcfdb9e5d277bd2a1c43b73f4f9f (patch)
tree 2ad44af50df01ab518d0d4b9c030cb81c4d94f9e
parent 91845d31ef4dd291bef12fa6f9a2755285de4739 (diff)
download gn-transform-databases-c9a84ecb21c7fcfdb9e5d277bd2a1c43b73f4f9f.tar.gz
Add new file with classification hierarchy ontology
Signed-off-by: Munyoki Kilyungi <me@bonfacemunyoki.com>
-rwxr-xr-x examples/classification.scm 152
-rwxr-xr-x examples/species-metadata.scm 77
2 files changed, 152 insertions, 77 deletions
diff --git a/examples/classification.scm b/examples/classification.scm
new file mode 100755
index 0000000..64aeef3
--- /dev/null
+++ b/examples/classification.scm
@@ -0,0 +1,152 @@
+#! /usr/bin/env guile
+!#
+
+(use-modules (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 getopt-long)
+             (ice-9 match)
+             (ice-9 regex)
+             (transform strings)
+             (transform sql)
+             (transform triples)
+             (transform special-forms))
+
+
+
+(define (remap-species-identifiers str)
+  "Return the standard binomial name for the species label STR, or STR itself
+   when no remapping is listed.  Stop-gap until the database is corrected."
+  (or (assoc-ref '(("Fly (Drosophila melanogaster dm6)" . "Drosophila melanogaster")
+                   ("Oryzias latipes (Japanese medaka)" . "Oryzias latipes")
+                   ("Macaca mulatta" . "Macaca nemestrina") ; NOTE(review): two distinct binomials -- confirm intended
+                   ("Bat (Glossophaga soricina)" . "Glossophaga soricina"))
+                 str)
+      str))                             ; unlisted labels pass through unchanged
+
+;; Classification scheme: the concept scheme itself, the gnc:Type and gnc:Species levels, and the Species rows listed as members of gnc:Species.
+(define-transformer classification-scheme-species
+  (tables (Species))                    ; rows come from the Species table only
+  (schema-triples
+   (gnc:ResourceClassificationScheme a skos:ConceptScheme)
+   (gnc:ResourceClassificationScheme skos:prefLabel "GeneNetwork Classification Scheme For Resources")
+   (gnc:ResourceClassificationScheme xkos:numberOfLevels "3") ; the three levels named on the next line
+   (gnc:ResourceClassificationScheme xkos:levels "( gnc:Type gnc:Set gnc:Species )") ; written as a string literal here
+   (gnc:Type a xkos:ClassificationLevel) ; level of depth 1
+   (gnc:Type skos:prefLabel "The Type of a Dataset which can be a ProbeSet, Genotype, or Phenotype")
+   (gnc:Type xkos:depth "1")
+   (gnc:Type skos:member gn:ProbeSet)   ; no prefLabel/altLabel is declared for gn:ProbeSet in this block
+   (gnc:Type skos:member gn:Genotype)
+   (gn:Genotype skos:altLabel "DNA Markers and SNPs")
+   (gn:Genotype skos:prefLabel "Genotype")
+   (gnc:Type skos:member gn:Phenotype)
+   (gn:Phenotype skos:prefLabel "Phenotype")
+   (gn:Phenotype skos:altLabel "Traits and Cofactors")
+   (gnc:Species a xkos:ClassificationLevel) ; level of depth 3; gnc:Set (depth 2) is declared in classification-scheme-set
+   (gnc:Species skos:prefLabel "The species in which this resource belongs")
+   (gnc:Species xkos:depth "3")
+   (gnc:Species xkos:specializes gnc:Set))
+  (triples "gnc:Species"                ; subject is the fixed string "gnc:Species"
+    (set skos:member (string->identifier "" (remap-species-identifiers (field Species Fullname)) ; object: identifier from the remapped name; NOTE(review): the species transformer also spells this column FullName -- confirm both resolve alike
+                          #:separator ""
+                          #:proc string-capitalize-first))))
+
+(define-transformer classification-scheme-set ; declares the gnc:Set level and lists InbredSet rows as its members
+  (tables (InbredSet))                  ; rows come from the InbredSet table only
+  (schema-triples
+   (gnc:Set a xkos:ClassificationLevel)
+   (gnc:Set skos:prefLabel "The Type of Set, Ie InbredSet/OutbredSet that a resource can belong to")
+   (gnc:Set xkos:depth "2")             ; sits between gnc:Type (depth 1) and gnc:Species (depth 3)
+   (gnc:Set xkos:generalizes gnc:Species)) ; counterpart of the (gnc:Species xkos:specializes gnc:Set) triple in classification-scheme-species
+  (triples "gnc:Set"                    ; subject is the fixed string "gnc:Set"
+    (set skos:member
+         (string->identifier            ; same "set" + InbredSet.Name expression that inbred-set uses for its subject
+          "set" (field InbredSet Name)
+          #:separator ""
+          #:proc string-capitalize-first))))
+
+(define-transformer species             ; describes each Species row and places it in gnc:ResourceClassificationScheme
+  (tables (Species))
+  (schema-triples
+   (gnt:family a owl:ObjectProperty)
+   (gnt:family rdfs:domain gnc:Species)
+   (gnt:family skos:definition "This resource belongs to this family"))
+  (triples
+      (string->identifier "" (remap-species-identifiers (field Species FullName)) ; subject; column spelled FullName to match the rdfs:label line below
+                          #:separator ""
+                          #:proc string-capitalize-first)
+    (set skos:inScheme 'gnc:ResourceClassificationScheme)
+    (set rdfs:label (field Species FullName))
+    (set skos:prefLabel (field Species MenuName))
+    (set skos:altLabel (field Species SpeciesName)) ; two altLabel values are set: SpeciesName and Name
+    (set skos:altLabel (field Species Name))
+    (set gnt:family (field Species Family))
+    (set skos:notation (ontology        ; taxon: prefix combined with the TaxonomyId column
+                         'taxon:
+                         (field Species TaxonomyId)))))
+
+(define-transformer inbred-set          ; describes each InbredSet row and links it to its species
+  (tables (InbredSet
+           (left-join Species "ON InbredSet.SpeciesId=Species.Id")
+           (left-join MappingMethod
+                      "ON InbredSet.MappingMethodId=MappingMethod.Id")))
+  (schema-triples
+   (gnt:geneticType a owl:ObjectProperty)
+   (gnt:geneticType rdfs:domain gnc:Set) ; domain is the declared level gnc:Set (capitalised), not gnc:set
+   (gnt:code a owl:ObjectProperty)
+   (gnt:code rdfs:domain gnc:Set)
+   ;; Already defined as an owl prop in species
+   (gnt:family rdfs:domain gnc:Set)
+   (gnt:mappingMethod a owl:ObjectProperty)
+   (gnt:mappingMethod rdfs:domain gnc:Set))
+  (triples (string->identifier          ; subject: same expression classification-scheme-set lists under gnc:Set
+            "set" (field InbredSet Name)
+            #:separator ""
+            #:proc string-capitalize-first)
+    (set skos:inScheme 'gnc:ResourceClassificationScheme)
+    (set rdfs:label (field InbredSet FullName))
+    (set skos:prefLabel (field InbredSet Name))
+    (set gnt:geneticType (field InbredSet GeneticType))
+    (set gnt:family (field InbredSet Family))
+    (set gnt:mappingMethod (field MappingMethod Name)) ; from the left-joined MappingMethod table
+    (set gnt:code (field InbredSet InbredSetCode))
+    (set xkos:generalizes               ; object: species identifier, built as in the species transformer
+         (string->identifier "" (remap-species-identifiers (field Species Fullname))
+                             #:separator ""
+                             #:proc string-capitalize-first))))
+
+
+
+(let* ((option-spec                     ; command line: -s/--settings, -o/--output, -d/--documentation
+        '((settings (single-char #\s) (value #t) (required? #t)) ; mandatory: it is opened as a file below
+          (output (single-char #\o) (value #t))
+          (documentation (single-char #\d) (value #t))))
+       (options (getopt-long (command-line) option-spec))
+       (settings (option-ref options 'settings #f))
+       (output (option-ref options 'output #f)) ; #f when -o is not given
+       (documentation (option-ref options 'documentation #f)) ; #f when -d is not given
+       (%connection-settings            ; first datum read from the settings file
+        (call-with-input-file settings
+          read)))
+
+  (with-documentation
+   (name "Species Metadata")            ; NOTE(review): title says "Species Metadata" although this file is classification.scm -- confirm
+   (connection %connection-settings)
+   (table-metadata? #f)
+   (prefixes                            ; prefix -> namespace IRI pairs used by the transformers above
+    '(("gn:" "<http://genenetwork.org/id/>")
+      ("gnc:" "<http://genenetwork.org/category/>")
+      ("owl:" "<http://www.w3.org/2002/07/owl#>")
+      ("gnt:" "<http://genenetwork.org/term/>")
+      ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
+      ("xkos:" "<http://rdf-vocabulary.ddialliance.org/xkos#>")
+      ("rdf:" "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>")
+      ("rdfs:" "<http://www.w3.org/2000/01/rdf-schema#>")
+      ("taxon:" "<http://purl.uniprot.org/taxonomy/>")))
+   (inputs                              ; the four transformers defined in this file, in definition order
+    (list classification-scheme-species
+          classification-scheme-set
+          species
+          inbred-set))
+   (outputs
+    `(#:documentation ,documentation
+      #:rdf ,output))))
diff --git a/examples/species-metadata.scm b/examples/species-metadata.scm
index b330b12..89f2e27 100755
--- a/examples/species-metadata.scm
+++ b/examples/species-metadata.scm
@@ -21,33 +21,6 @@
     ["Bat (Glossophaga soricina)" "Glossophaga soricina"]
     [str str]))
 
-(define-transformer species
-  (tables (Species))
-  (schema-triples
-   (gnc:species a skos:Concept)
-   (gnc:species skos:description "This is a set of controlled terms that are used to describe a given species")
-   (gnc:species skos:broader gnc:family)
-   (gnt:binomialName a owl:ObjectProperty)
-   (gnt:binomialName rdfs:domain gnc:species)
-   (gnt:family a owl:ObjectProperty)
-   (gnt:family rdfs:domain gnc:species)
-   (gnt:family skos:definition "This resource belongs to this family")
-   (gnt:organism a owl:ObjectProperty)
-   (gnt:organism rdfs:domain gnc:species)
-   (gnt:shortName a owl:ObjectProperty)
-   (gnt:shortName rdfs:domain gnc:species))
-  (triples
-      (string->identifier "" (remap-species-identifiers (field Species Fullname))
-                          #:separator ""
-                          #:proc string-capitalize-first)
-    (set rdf:type 'gnc:species)
-    (set skos:label (field Species SpeciesName))
-    (set skos:altLabel (field Species Name))
-    (set rdfs:label (field Species MenuName))
-    (set gnt:binomialName (field Species FullName))
-    (set gnt:family (field Species Family))
-    (set gnt:organism (ontology 'taxon: (field Species TaxonomyId)))))
-
 #!
 
 The ProbeData table contains StrainID.
@@ -132,56 +105,6 @@ At this point it is not very clear how Name, Name2, Symbol and Alias are used.
     (set rdf:type 'gnc:mappingMethod)
     (set rdfs:label (field MappingMethod Name))))
 
-
-(define-transformer inbred-set
-  (tables (InbredSet
-           (left-join Species "ON InbredSet.SpeciesId=Species.Id")
-           (left-join MappingMethod
-                      "ON InbredSet.MappingMethodId=MappingMethod.Id")))
-  (schema-triples
-   (gnc:set skos:broader gnc:species)
-   (gnc:set skos:definition "A set of terms used to describe an set, which can be inbredSet, outbredSet etc etc.")
-   (gnt:geneticType a owl:ObjectProperty)
-   (gnt:geneticType rdfs:domain gnc:set)
-   (gnt:code a owl:ObjectProperty)
-   (gnt:code rdfs:domain gnc:set)
-   ;; Already defined as an owl prop in species
-   (gnt:family rdfs:domain gnc:set)
-   (gnt:phenotype a owl:ObjectProperty)
-   (gnt:phenotype rdfs:domain gnc:set)
-   (gnt:genotype a owl:ObjectProperty)
-   (gnt:genotype rdfs:domain gnt:inbredSet)
-   (gnt:mappingMethod a owl:ObjectProperty)
-   (gnt:mappingMethod rdfs:domain gnc:set))
-  (triples (string->identifier
-            "set" (field InbredSet Name)
-            #:separator ""
-            #:proc string-capitalize-first)
-    (set rdf:type 'gnc:set)
-    (set rdfs:label (field InbredSet FullName))
-    (set skos:altLabel (field InbredSet Name))
-    (set gnt:geneticType (field InbredSet GeneticType))
-    (set gnt:family (field InbredSet Family))
-    (set gnt:mappingMethod (field MappingMethod Name))
-    (set gnt:code (field InbredSet InbredSetCode))
-    (set gnt:belongsToSpecies
-         (string->identifier "" (remap-species-identifiers (field Species Fullname))
-                             #:separator ""
-                             #:proc string-capitalize-first))
-    (set gnt:genotype
-         (field ("IF ((SELECT PublishFreeze.Name FROM PublishFreeze WHERE PublishFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'Traits and Cofactors', '')" genotypeP)))
-    (set gnt:phenotype
-         (field ("IF ((SELECT GenoFreeze.Name FROM GenoFreeze WHERE GenoFreeze.InbredSetId = InbredSet.Id LIMIT 1) IS NOT NULL, 'DNA Markers and SNPs', '')" phenotypeP)))
-    (multiset gnt:hasTissue
-              (map
-               (lambda (x)
-                 (string->identifier "tissue"
-                                     x))
-               (string-split-substring
-                (field ("(SELECT GROUP_CONCAT(DISTINCT Tissue.Short_Name SEPARATOR'||') AS MolecularTraits FROM ProbeFreeze, ProbeSetFreeze, InbredSet, Tissue, Species WHERE ProbeFreeze.TissueId = Tissue.Id AND ProbeFreeze.InbredSetId = InbredSet.Id AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id ORDER BY Tissue.Name)"
-                        molecularTrait))
-                "||")))))
-
 (define-transformer avg-method
   ;; The Name and Normalization fields seem to be the same. Dump only
   ;; the Name field.