about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-xexamples/dump-phenotype.scm88
1 files changed, 54 insertions, 34 deletions
diff --git a/examples/dump-phenotype.scm b/examples/dump-phenotype.scm
index cd6ca95..19a8892 100755
--- a/examples/dump-phenotype.scm
+++ b/examples/dump-phenotype.scm
@@ -20,12 +20,12 @@
 
 
 
-;; Only dump publish freeze entries that were not dumped from the InfoFiles page
+;; These are phenotype datasets that don't have Infofile metadata
 (define-dump dump-publishfreeze
   (tables (PublishFreeze
            (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")
            (left-join InbredSet "ON PublishFreeze.InbredSetId = InbredSet.InbredSetId"))
-          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoPageName IS NULL")
+          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND InfoFiles.InfoFileId IS NULL")
   (triples
       (string->identifier
        ""
@@ -51,27 +51,32 @@
   (tables (Phenotype
            (left-join PublishXRef "ON Phenotype.Id = PublishXRef.PhenotypeId")
            (left-join Publication "ON Publication.Id = PublishXRef.PublicationId")
+           ;; We need this join so as to construct the trait's skos:altLabel
+           (left-join InbredSet "ON InbredSet.InbredSetId = PublishXRef.InbredSetId")
            (left-join PublishFreeze "ON PublishFreeze.InbredSetId = PublishXRef.InbredSetId")
-           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")))
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name"))
+          ;; Only dump public traits; ignore "hanging" traits,
+          ;; i.e. traits that have no associated vectors
+          "WHERE PublishFreeze.public > 0 AND PublishFreeze.confidentiality < 1 AND PublishFreeze.Id IS NOT NULL")
   (schema-triples
-   (gnt:originalDescription a owl:ObjectProperty)
-   (gnt:originalDescription rdfs:domain gnc:phenotype)
-   (gnt:originalDescription skos:definition "The original description of this resource")
-   (gnt:prePublicationDescription a owl:ObjectProperty)
-   (gnt:prePublicationDescription rdfs:domain gnc:phenotype)
-   (gnt:prePublicationDescription skos:definition "The pre publication details of this resource")
    (gnt:abbreviation a owl:ObjectProperty)
    (gnt:abbreviation rdfs:domain gnc:phenotype)
    (gnt:abbreviation skos:definition "The abbreviation used for this resource")
-   (gnt:labCode rdfs:range rdfs:Literal)
-   (gnt:submitter rdfs:range rdfs:Literal)
-   (gnt:owner rdfs:range rdfs:Literal)
+   (gnt:labCode a owl:ObjectProperty)
+   (gnt:labCode rdfs:domain gnc:phenotype)
+   (gnt:submitter a owl:ObjectProperty)
+   (gnt:submitter rdfs:domain gnc:phenotype)
+   (gnt:submitter skos:definition "A person who submitted this resource to GN")
+   (gnt:mean rdfs:domain gnc:phenotype)
    (gnt:mean rdfs:range xsd:double)
-   (gnt:LRS rdfs:range xsd:float)
+   (gnt:LRS rdfs:domain gnc:phenotype)
+   (gnt:LRS rdfs:range xsd:double)
+   (gnt:locus rdfs:domain gnc:phenotype)
    (gnt:locus rdfs:range rdfs:Literal)
-   (gnt:additive rdfs:range xsd:decimal)
-   (gnt:sequence rdfs:range rdfs:Literal)
-   (gnt:phenotypeOfPublication rdfs:range gn-term:pubMedId))
+   (gnt:additive rdfs:domain gnc:phenotype)
+   (gnt:additive rdfs:range xsd:double)
+   (gnt:sequence rdfs:domain gnc:phenotype)
+   (gnt:sequence rdfs:range xsd:integer))
   (triples (string->identifier
             ""
             (regexp-substitute/global #f "[^A-Za-z0-9:]"
@@ -80,32 +85,44 @@
             #:separator ""
             #:proc string-capitalize-first)
     (set rdf:type 'gnc:phenotype)
-    (set rdfs:label (sanitize-rdf-string
-                     (field
-                      ("IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation)"
-                       PhenotypeName))))
-    ;; There is no row with an empty post-publication description so
-    ;; use this field as the main publication description
+    (set skos:prefLabel (sanitize-rdf-string
+                         (field
+                          ("IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation)"
+                           PhenotypeName))))
+    ;; Add an alternative name for this resource.  This is how GN
+    ;; currently labels phenotypes
+    (set skos:altLabel (field
+                        ("CONCAT(InbredSet.Name, '_', PublishXRef.Id)"
+                         phenotypeAltName)))
+    ;; All phenotypes have a post-publication description
     (set dct:description
          (sanitize-rdf-string
-          (field Phenotype  Post_publication_description)))
-    (set gnt:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation)))
-    (set gnt:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation)))
+          (field Phenotype Post_publication_description)))
+    ;; All phenotypes have a post-publication abbreviation
+    (set gnt:abbreviation (field Phenotype Post_publication_abbreviation))
     (set gnt:labCode (field Phenotype Lab_code))
-    (set gdmt:hasDistributorInfo
+    (set gnt:submitter
          (sanitize-rdf-string (field Phenotype Submitter)))
-    (set gnt:owner (sanitize-rdf-string (field Phenotype Owner)))
+    (set dct:contributor (sanitize-rdf-string (field Phenotype Owner)))
+    (multiset dct:contributor (string-split
+                               (sanitize-rdf-string (field Phenotype Owner))
+                               #\,))
     (set gnt:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean))
                                   '^^xsd:double))
     (set gnt:locus (field PublishXRef Locus))
-    (set gnt:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:double))
-    (set gnt:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:double))
-    (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int))
+    (set gnt:LRS (annotate-field
+                  (field ("IFNULL(PublishXRef.LRS, '')" lrs))
+                  '^^xsd:double))
+    (set gnt:additive
+         (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive))
+                         '^^xsd:double))
+    (set gnt:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:integer))
     (set gnt:belongsToDataset
          (string->identifier
           ""
-          (field
-           ("IFNULL(InfoFiles.InfoPageName, IFNULL(PublishFreeze.Name, ''))" DatasetName))
+          (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                    (field InfoFiles InfoPageName)
+                                    'pre "_" 'post)
           #:separator ""
           #:proc string-capitalize-first))
     (set dct:isReferencedBy
@@ -126,6 +143,7 @@
  (prefixes
   '(("dct:" "<http://purl.org/dc/terms/>")
     ("gn:" "<http://genenetwork.org/id/>")
+    ("owl:" "<http://www.w3.org/2002/07/owl#>")
     ("gnc:" "<http://genenetwork.org/category/>")
     ("gnt:" "<http://genenetwork.org/terms/>")
     ("skos:" "<http://www.w3.org/2004/02/skos/core#>")
@@ -134,8 +152,10 @@
     ("xsd:" "<http://www.w3.org/2001/XMLSchema#>")
     ("pubmed:" "<http://rdf.ncbi.nlm.nih.gov/pubmed/>")))
  (inputs
-  (list dump-publishfreeze
-        dump-phenotypes))
+  (list
+   ;; dump-publishfreeze
+   dump-phenotypes
+   ))
  (outputs
   '(#:documentation "./docs/dump-phenotype.md"
     #:rdf "./verified-data/dump-phenotype.ttl")))