about summary refs log tree commit diff
path: root/dump.scm
diff options
context:
space:
mode:
Diffstat (limited to 'dump.scm')
-rwxr-xr-xdump.scm318
1 files changed, 318 insertions, 0 deletions
diff --git a/dump.scm b/dump.scm
new file mode 100755
index 0000000..596a188
--- /dev/null
+++ b/dump.scm
@@ -0,0 +1,318 @@
+#! /usr/bin/env guile
+!#
+
+(add-to-load-path (dirname (current-filename)))
+
+(use-modules (rnrs io ports)
+             (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 string-fun)
+             (dump sql))
+
+
+;;; GeneNetwork database connection parameters and dump path
+
+(define (call-with-genenetwork-database proc)
+  (let ((connection-settings (call-with-input-file "conn.scm" read)))
+    (call-with-database "mysql" (string-join
+                                 (list (assq-ref connection-settings 'username)
+                                       (assq-ref connection-settings 'password)
+                                       (assq-ref connection-settings 'database)
+                                       "tcp"
+                                       (assq-ref connection-settings 'host)
+                                       (number->string
+                                        (assq-ref connection-settings 'port)))
+                                 ":")
+                        proc)))
+
+(define %dump-directory
+  (string-append (getenv "HOME") "/data/dump"))
+
+(define (call-with-dump-file filename proc)
+  (let ((absolute-path (string-append %dump-directory filename)))
+    (display absolute-path)
+    (newline)
+    (call-with-output-file absolute-path proc)))
+
+
+;; Dump schema annotations to org
+
+(define (get-tables-from-comments db)
+  (sql-map (match-lambda
+                  ((("TableName" . table)) table))
+                db
+                "SELECT TableName FROM TableComments"))
+
+(define (dump-table-fields db table)
+  (format #t "* ~a~%" table)
+  (match (sql-find db (format #f "SELECT Comment FROM TableComments WHERE TableName = '~a'"
+                                   table))
+    ((("Comment" . comment))
+     (format #t "~a~%" comment)))
+  (sql-for-each (lambda (row)
+                       (match row
+                         ((("TableField" . table-field)
+                           ("Foreign_Key" . foreign-key)
+                           ("Annotation" . annotation))
+                          (format #t "** ~a~%" (substring table-field (1+ (string-length table))))
+                          (unless (string-null? foreign-key)
+                            (format #t "Foreign key to ~a~%" foreign-key))
+                          (unless (string-null? annotation)
+                            (display annotation)
+                            (newline)))))
+                     db
+                     (format #f "SELECT TableField, Foreign_Key, Annotation FROM TableFieldAnnotation WHERE TableField LIKE '~a.%'"
+                             table))
+  (newline))
+
+(define (dump-schema-annotations db)
+  (call-with-genenetwork-database
+   (lambda (db)
+     (for-each (cut dump-table-fields db <>)
+               (get-tables-from-comments db)))))
+
+
+;;; Dump tables
+
+(define (camel->kebab str)
+  (call-with-output-string
+    (lambda (port)
+      (string-for-each (lambda (c)
+                         (when (and (not (zero? (port-position port)))
+                                    (char-set-contains? char-set:upper-case c))
+                           (put-char port #\-))
+                         (put-char port (char-downcase c)))
+                       str))))
+
+(define (snake->lower-camel str)
+  (let ((char-list (string->list str)))
+    (call-with-output-string
+      (lambda (port)
+        (put-char port (char-downcase (string-ref str 0)))
+        (map (lambda (char previous-char)
+               (unless (char=? char #\_)
+                 (put-char port (if (char=? previous-char #\_)
+                                    (char-upcase char)
+                                    char))))
+             (drop char-list 1)
+             char-list)))))
+
+(define (camel->lower-camel str)
+  (string-append (string-downcase (substring str 0 1))
+                 (substring str 1)))
+
+(define (scm->triples alist id)
+  (for-each (match-lambda
+              ((predicate . object)
+               (triple id predicate object)))
+            alist))
+
+(define (process-metadata-alist alist)
+  (filter-map (match-lambda
+                ((key . "") #f)
+                ((key . value)
+                 (cons (string->symbol
+                        (string-append
+                         "gn:" (camel->lower-camel
+                                (snake->lower-camel key))))
+                       value)))
+              alist))
+
+(define (triple subject predicate object)
+  (format #t "~a ~a ~s .~%" subject predicate object))
+
+(define (binomial-name->species-id binomial-name)
+  (string->symbol
+   (string-append "gn:" (string-replace-substring binomial-name " " "_"))))
+
+(define (dump-species db)
+  (sql-for-each (lambda (alist)
+                  (match alist
+                    (((_ . common-name)
+                      (_ . menu-name)
+                      (_ . binomial-name))
+                     (let ((id (binomial-name->species-id binomial-name)))
+                       (triple id 'rdf:type 'gn:species)
+                       ;; Common name
+                       (triple id 'gn:name common-name)
+                       ;; Menu name (TODO: Maybe, drop this
+                       ;; field. It can be inferred from the
+                       ;; common name.)
+                       (triple id 'gn:menuName menu-name)
+                       ;; Binomial name
+                       (triple id 'gn:binomialName binomial-name)))))
+                db
+                "SELECT SpeciesName, MenuName, FullName FROM Species"))
+
+(define (dump-strain db)
+  (sql-for-each (lambda (alist)
+                  (match alist
+                    (((_ . binomial-name)
+                      (_ . name)
+                      (_ . name2)
+                      (_ . symbol)
+                      (_ . alias))
+                     (let ((id
+                            ;; TODO: Ensure this identifier does not collide.
+                            (string-append "gn:"
+                                           (string-map (lambda (c)
+                                                         (case c
+                                                           ((#\/ #\< #\> #\+ #\( #\) #\space) #\_)
+                                                           (else c)))
+                                                       name))))
+                       (triple id 'rdf:type 'gn:strain)
+                       ;; The species this is a strain of
+                       (triple id 'gn:strainOfSpecies
+                               (binomial-name->species-id binomial-name))
+                       ;; Name
+                       (triple id 'gn:name name)
+                       ;; A second name, if there is one
+                       (unless (string=? name name2)
+                         (triple id 'gn:name name2))
+                       ;; Symbol, if there is one
+                       (unless (string-null? symbol)
+                         (triple id 'gn:symbol symbol))
+                       ;; Alias, if there is one
+                       (unless (string-null? alias)
+                         (triple id 'gn:alias alias))))))
+                db
+                "SELECT Species.FullName, Strain.Name, Strain.Name2, Strain.Symbol, Strain.Alias FROM Strain JOIN Species ON Strain.SpeciesId = Species.SpeciesId"))
+
+(define (mapping-method-name->id name)
+  (string->symbol (string-append "gn:mappingMethod" name)))
+
+(define (dump-mapping-method db)
+  (sql-for-each (match-lambda
+                  (((_ . name))
+                   (triple (string-append "gn:mappingMethod" name)
+                           'rdf:type 'gn:mappingMethod)))
+                db
+                "SELECT Name FROM MappingMethod"))
+
+(define (inbred-set-name->id name)
+  (string->symbol (string-append "gn:inbredSet" name)))
+
+(define (dump-inbred-set db)
+  (sql-for-each (lambda (alist)
+                  (let ((id (inbred-set-name->id (assoc-ref alist "Name"))))
+                    (triple id 'rdf:type 'gn:phenotype)
+                    (scm->triples
+                     (filter-map (match-lambda
+                                   (('gn:binomialName . binomial-name)
+                                    (cons 'gn:inbredSetOfSpecies
+                                          (binomial-name->species-id binomial-name)))
+                                   (('gn:mappingMethodName . mapping-method-name)
+                                    (cons 'gn:inbredSetMappingMethod
+                                          (mapping-method-name->id mapping-method-name)))
+                                   (x x))
+                                 (process-metadata-alist alist))
+                     id)))
+                db
+                "SELECT InbredSet.Name, InbredSet.FullName, GeneticType, Family,
+Species.FullName AS BinomialName, MappingMethod.Name AS MappingMethodName
+FROM InbredSet
+INNER JOIN Species USING (SpeciesId)
+INNER JOIN MappingMethod ON InbredSet.MappingMethodId = MappingMethod.Id"))
+
+(define (phenotype-id->id id)
+  (string->symbol (string-append "gn:phenotype" (number->string id))))
+
+(define (dump-phenotype db)
+  (sql-for-each (lambda (alist)
+                  (let ((id (phenotype-id->id (assoc-ref alist "Id"))))
+                    (triple id 'rdf:type 'gn:phenotype)
+                    (scm->triples
+                     (filter (match-lambda
+                               (('gn:id . _) #f)
+                               (('gn:units . value)
+                                (string-ci=? value "unknown"))
+                               (_ #t))
+                             (process-metadata-alist alist))
+                     id)))
+                db
+                "SELECT Id, Pre_publication_description, Post_publication_description,
+Original_description, Units, Pre_publication_abbreviation, Post_publication_abbreviation,
+Lab_code, Submitter, Owner, Authorized_Users FROM Phenotype"))
+
+(define (dump-publication db)
+  (sql-for-each (lambda (alist)
+                  (let ((id (string-append "gn:publication"
+                                           (number->string (assoc-ref alist "Id")))))
+                    (triple id 'rdf:type 'gn:publication)
+                    (triple 'gn:title 'rdfs:subPropertyOf 'rdfs:label)
+                    (scm->triples
+                     (append-map (match-lambda
+                                   (('gn:id . _) '())
+                                   ;; The authors field is a comma
+                                   ;; separated list. Split it.
+                                   (('gn:authors . authors)
+                                    (map (lambda (author-name)
+                                           (cons 'gn:author (string-trim author-name)))
+                                         (string-split authors #\,)))
+                                   (('gn:abstract . abstract)
+                                    ;; TODO: Handle unprintable
+                                    ;; characters better.
+                                    (list (cons 'gn:abstract
+                                                (string-replace-substring abstract "\x01" ""))))
+                                   (x (list x)))
+                                 (process-metadata-alist alist))
+                     id)))
+                db
+                "SELECT Id, PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, Month, Year FROM Publication"))
+
+(define (dump-publish-xref db)
+  (sql-for-each (match-lambda
+                  (((_ . inbred-set-name)
+                    (_ . phenotype-id))
+                   (triple (phenotype-id->id phenotype-id)
+                           'gn:phenotypeOfSpecies
+                           (inbred-set-name->id inbred-set-name))))
+                db
+                "SELECT Name, PhenotypeId
+FROM PublishXRef
+INNER JOIN InbredSet USING (InbredSetId)"))
+
+(define (dump-data-table db table-name data-field)
+  (let ((dump-directory (string-append %dump-directory "/" table-name))
+        (port #f)
+        (current-strain-id #f))
+    (unless (file-exists? dump-directory)
+      (mkdir dump-directory))
+    (sql-for-each (match-lambda
+                    (((_ . strain-id)
+                      (_ . value))
+                     ;; Close file if new strain.
+                     (when (and port
+                                (not (= current-strain-id strain-id)))
+                       (close-port port)
+                       (set! port #f))
+                     ;; If no file is open, open new file.
+                     (unless port
+                       (set! current-strain-id strain-id)
+                       (let ((filename (string-append dump-directory
+                                                      "/" (number->string strain-id))))
+                         (display filename (current-error-port))
+                         (newline (current-error-port))
+                         (set! port (open-output-file filename))))
+                     (display value port)
+                     (newline port)))
+                  db
+                  (format #f "SELECT StrainId, ~a FROM ~a ORDER BY StrainId"
+                          data-field table-name))
+    (close-port port)))
+
+(call-with-genenetwork-database
+ (lambda (db)
+   (with-output-to-file (string-append %dump-directory "/dump.ttl")
+     (lambda ()
+       (format #t "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .~%")
+       (format #t "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .~%")
+       (format #t "@prefix gn: <https://genenetwork.org/> .~%~%")
+       (dump-species db)
+       (dump-strain db)
+       (dump-mapping-method db)
+       (dump-inbred-set db)
+       (dump-phenotype db)
+       (dump-publication db)
+       (dump-publish-xref db)))))