about summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x dump.scm 71
1 files changed, 71 insertions, 0 deletions
diff --git a/dump.scm b/dump.scm
index 5cfdd72..4a3f37f 100755
--- a/dump.scm
+++ b/dump.scm
@@ -726,6 +726,76 @@ must be remedied."
     (set gn:citation (field Datasets Citation))
     (set gn:acknowledgment (field Datasets Acknowledgment))))
 
+;; Dump one gn:phenotype resource per row of Phenotype left-joined with its
+;; PublishXRef, Publication and PublishFreeze rows.
+;; NOTE(review): InfoFiles is joined but never read, and no WHERE clause
+;; excludes phenotypes already described there -- confirm the intended filter.
+(define-dump dump-phenotypes
+  (tables (Phenotype
+           (left-join PublishXRef "ON Phenotype.Id = PublishXRef.PhenotypeId")
+           (left-join Publication "ON Publication.Id = PublishXRef.PublicationId")
+           (left-join PublishFreeze "ON PublishFreeze.InbredSetId = PublishXRef.InbredSetId")
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = PublishFreeze.Name")))
+  (schema-triples
+   (gn:phenotypeDataset rdfs:subPropertyOf gn:dataset))
+  (triples (ontology 'phenotype:
+                     (regexp-substitute/global #f "[^A-Za-z0-9:]"
+                                               (field ("CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, ':')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation))" abbrev))
+                                               'pre "_" 'post))
+    (set rdf:type 'gn:phenotype)
+    ;; This SQL differs from the subject's ("-" separator, charset round-trip),
+    ;; so it gets its own alias instead of sharing `abbrev' with the subject.
+    (set gn:name
+         (sanitize-rdf-string
+          (field ("CAST(CONVERT(BINARY CONVERT(CONCAT(IF(PublishFreeze.Name IS NULL, '', CONCAT(PublishFreeze.Name, '-')), IF(Phenotype.Post_publication_abbreviation IS NULL, IF(Phenotype.Pre_publication_abbreviation IS NULL, Phenotype.Id, Phenotype.Pre_publication_abbreviation), Phenotype.Post_publication_abbreviation)) USING latin1) USING utf8) AS VARCHAR(10000))" phenotypeName))))
+    ;; There is no row with an empty post-publication description so
+    ;; use this field as the main publication description
+    (set gn:publicationDescription
+         (sanitize-rdf-string
+          (field ("CAST(CONVERT(BINARY CONVERT(Phenotype.Post_publication_description USING latin1) USING utf8) AS CHAR(10000))" postPubDescr))))
+    (set gn:originalDescription (sanitize-rdf-string
+                                 (delete-substrings
+                                  (field Phenotype Original_description)
+                                  "Original post publication description: ")))
+    (set gn:prePublicationDescription
+         (sanitize-rdf-string
+          (field ("CAST(CONVERT(BINARY CONVERT(Phenotype.Pre_publication_description USING latin1) USING utf8) AS VARCHAR(15000))" prePubDesc))))
+    (set gn:prePublicationAbbreviation (sanitize-rdf-string (field Phenotype Pre_publication_abbreviation)))
+    (set gn:postPublicationAbbreviation (sanitize-rdf-string (field Phenotype Post_publication_abbreviation)))
+    (set gn:labCode (field Phenotype Lab_code))
+    (set gn:submitter (sanitize-rdf-string (field Phenotype Submitter)))
+    (set gn:owner (sanitize-rdf-string (field Phenotype Owner)))
+    (set gn:mean (annotate-field (field ("IFNULL(PublishXRef.mean, '')" mean)) '^^xsd:float))
+    (set gn:locus (field PublishXRef Locus))
+    (set gn:LRS (annotate-field (field ("IFNULL(PublishXRef.LRS, '')" lrs)) '^^xsd:float))
+    (set gn:additive (annotate-field (field ("IFNULL(PublishXRef.additive, '')" additive)) '^^xsd:decimal))
+    (set gn:sequence (annotate-field (field PublishXRef Sequence) '^^xsd:int))
+    (set gn:phenotypeOfDataset (string->identifier "dataset" (field PublishFreeze Name)))
+    ;; Link to the pubmed: resource when a PubMed id exists, else to a local publication identifier.
+    (set gn:phenotypeOfPublication
+         (let ((pmid (field
+                      ("IF(Publication.PubMed_ID IS NULL, '', CONVERT(Publication.PubMed_Id, INT))"
+                       pmid)))
+               (publication-id (field Publication Id)))
+           (if (string-null? pmid)
+               (string->identifier "publication" (number->string publication-id))
+               (ontology 'pubmed: pmid))))))
+
+;; Dump every GenoFreeze row whose Name does not appear as an
+;; InfoFiles.InfoPageName, as a gn:genotypeDataset resource.
+(define-dump dump-genotypes
+  (tables (GenoFreeze
+           (left-join InfoFiles "ON InfoFiles.InfoPageName = GenoFreeze.Name"))
+          "WHERE GenoFreeze.Name NOT IN (SELECT DISTINCT InfoFiles.InfoPageName FROM InfoFiles)")
+  (schema-triples
+   (gn:genotypeDataset rdfs:subPropertyOf gn:dataset))
+  (triples (string->identifier "dataset" (field GenoFreeze Name))
+    (set rdf:type 'gn:genotypeDataset)
+    (set gn:name (field GenoFreeze FullName))
+    ;; XSD datatype names are case-sensitive: the type is dateTime, not datetime.
+    (set dct:created (annotate-field (field GenoFreeze CreateTime) '^^xsd:dateTime))))
+
+
 (define (dump-data-table db table-name data-field)
   (let ((dump-directory (string-append %dump-directory "/" table-name))
         (port #f)
@@ -960,6 +1030,7 @@ is a <table> object."
        (prefix "generif:" "<http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=>")
        (prefix "xsd:" "<http://www.w3.org/2001/XMLSchema#>")
        (prefix "owl:" "<http://www.w3.org/2002/07/owl#>")
+       (prefix "phenotype:" "<http://genenetwork.org/phenotype/>")
        (newline)
        (dump-genewiki-symbols db)
        (dump-gn-genewiki-entries db)