about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--README.org3
-rwxr-xr-xdump.scm318
-rw-r--r--dump/sql.scm54
-rw-r--r--guix.scm88
-rw-r--r--rdf.py25
5 files changed, 488 insertions, 0 deletions
diff --git a/README.org b/README.org
new file mode 100644
index 0000000..ad7ff32
--- /dev/null
+++ b/README.org
@@ -0,0 +1,3 @@
+The GeneNetwork database is being migrated from a relational database
+to a plain text and RDF database. This repository contains code to
+dump the relational database to plain text.
diff --git a/dump.scm b/dump.scm
new file mode 100755
index 0000000..596a188
--- /dev/null
+++ b/dump.scm
@@ -0,0 +1,318 @@
+#! /usr/bin/env guile
+!#
+
+(add-to-load-path (dirname (current-filename)))
+
+(use-modules (rnrs io ports)
+             (srfi srfi-1)
+             (srfi srfi-26)
+             (ice-9 match)
+             (ice-9 string-fun)
+             (dump sql))
+
+
+;;; GeneNetwork database connection parameters and dump path
+
+(define (call-with-genenetwork-database proc)
+  (let ((connection-settings (call-with-input-file "conn.scm" read)))
+    (call-with-database "mysql" (string-join
+                                 (list (assq-ref connection-settings 'username)
+                                       (assq-ref connection-settings 'password)
+                                       (assq-ref connection-settings 'database)
+                                       "tcp"
+                                       (assq-ref connection-settings 'host)
+                                       (number->string
+                                        (assq-ref connection-settings 'port)))
+                                 ":")
+                        proc)))
+
+(define %dump-directory
+  (string-append (getenv "HOME") "/data/dump"))
+
+(define (call-with-dump-file filename proc)
+  (let ((absolute-path (string-append %dump-directory filename)))
+    (display absolute-path)
+    (newline)
+    (call-with-output-file absolute-path proc)))
+
+
+;; Dump schema annotations to org
+
+(define (get-tables-from-comments db)
+  (sql-map (match-lambda
+                  ((("TableName" . table)) table))
+                db
+                "SELECT TableName FROM TableComments"))
+
+(define (dump-table-fields db table)
+  (format #t "* ~a~%" table)
+  (match (sql-find db (format #f "SELECT Comment FROM TableComments WHERE TableName = '~a'"
+                                   table))
+    ((("Comment" . comment))
+     (format #t "~a~%" comment)))
+  (sql-for-each (lambda (row)
+                       (match row
+                         ((("TableField" . table-field)
+                           ("Foreign_Key" . foreign-key)
+                           ("Annotation" . annotation))
+                          (format #t "** ~a~%" (substring table-field (1+ (string-length table))))
+                          (unless (string-null? foreign-key)
+                            (format #t "Foreign key to ~a~%" foreign-key))
+                          (unless (string-null? annotation)
+                            (display annotation)
+                            (newline)))))
+                     db
+                     (format #f "SELECT TableField, Foreign_Key, Annotation FROM TableFieldAnnotation WHERE TableField LIKE '~a.%'"
+                             table))
+  (newline))
+
+(define (dump-schema-annotations db)
+  (call-with-genenetwork-database
+   (lambda (db)
+     (for-each (cut dump-table-fields db <>)
+               (get-tables-from-comments db)))))
+
+
+;;; Dump tables
+
+(define (camel->kebab str)
+  (call-with-output-string
+    (lambda (port)
+      (string-for-each (lambda (c)
+                         (when (and (not (zero? (port-position port)))
+                                    (char-set-contains? char-set:upper-case c))
+                           (put-char port #\-))
+                         (put-char port (char-downcase c)))
+                       str))))
+
+(define (snake->lower-camel str)
+  (let ((char-list (string->list str)))
+    (call-with-output-string
+      (lambda (port)
+        (put-char port (char-downcase (string-ref str 0)))
+        (map (lambda (char previous-char)
+               (unless (char=? char #\_)
+                 (put-char port (if (char=? previous-char #\_)
+                                    (char-upcase char)
+                                    char))))
+             (drop char-list 1)
+             char-list)))))
+
+(define (camel->lower-camel str)
+  (string-append (string-downcase (substring str 0 1))
+                 (substring str 1)))
+
+(define (scm->triples alist id)
+  (for-each (match-lambda
+              ((predicate . object)
+               (triple id predicate object)))
+            alist))
+
+(define (process-metadata-alist alist)
+  (filter-map (match-lambda
+                ((key . "") #f)
+                ((key . value)
+                 (cons (string->symbol
+                        (string-append
+                         "gn:" (camel->lower-camel
+                                (snake->lower-camel key))))
+                       value)))
+              alist))
+
+(define (triple subject predicate object)
+  (format #t "~a ~a ~s .~%" subject predicate object))
+
+(define (binomial-name->species-id binomial-name)
+  (string->symbol
+   (string-append "gn:" (string-replace-substring binomial-name " " "_"))))
+
+(define (dump-species db)
+  (sql-for-each (lambda (alist)
+                  (match alist
+                    (((_ . common-name)
+                      (_ . menu-name)
+                      (_ . binomial-name))
+                     (let ((id (binomial-name->species-id binomial-name)))
+                       (triple id 'rdf:type 'gn:species)
+                       ;; Common name
+                       (triple id 'gn:name common-name)
+                       ;; Menu name (TODO: Maybe, drop this
+                       ;; field. It can be inferred from the
+                       ;; common name.)
+                       (triple id 'gn:menuName menu-name)
+                       ;; Binomial name
+                       (triple id 'gn:binomialName binomial-name)))))
+                db
+                "SELECT SpeciesName, MenuName, FullName FROM Species"))
+
+(define (dump-strain db)
+  (sql-for-each (lambda (alist)
+                  (match alist
+                    (((_ . binomial-name)
+                      (_ . name)
+                      (_ . name2)
+                      (_ . symbol)
+                      (_ . alias))
+                     (let ((id
+                            ;; TODO: Ensure this identifier does not collide.
+                            (string-append "gn:"
+                                           (string-map (lambda (c)
+                                                         (case c
+                                                           ((#\/ #\< #\> #\+ #\( #\) #\space) #\_)
+                                                           (else c)))
+                                                       name))))
+                       (triple id 'rdf:type 'gn:strain)
+                       ;; The species this is a strain of
+                       (triple id 'gn:strainOfSpecies
+                               (binomial-name->species-id binomial-name))
+                       ;; Name
+                       (triple id 'gn:name name)
+                       ;; A second name, if there is one
+                       (unless (string=? name name2)
+                         (triple id 'gn:name name2))
+                       ;; Symbol, if there is one
+                       (unless (string-null? symbol)
+                         (triple id 'gn:symbol symbol))
+                       ;; Alias, if there is one
+                       (unless (string-null? alias)
+                         (triple id 'gn:alias alias))))))
+                db
+                "SELECT Species.FullName, Strain.Name, Strain.Name2, Strain.Symbol, Strain.Alias FROM Strain JOIN Species ON Strain.SpeciesId = Species.SpeciesId"))
+
+(define (mapping-method-name->id name)
+  (string->symbol (string-append "gn:mappingMethod" name)))
+
+(define (dump-mapping-method db)
+  (sql-for-each (match-lambda
+                  (((_ . name))
+                   (triple (string-append "gn:mappingMethod" name)
+                           'rdf:type 'gn:mappingMethod)))
+                db
+                "SELECT Name FROM MappingMethod"))
+
+(define (inbred-set-name->id name)
+  (string->symbol (string-append "gn:inbredSet" name)))
+
+(define (dump-inbred-set db)
+  (sql-for-each (lambda (alist)
+                  (let ((id (inbred-set-name->id (assoc-ref alist "Name"))))
+                    (triple id 'rdf:type 'gn:phenotype)
+                    (scm->triples
+                     (filter-map (match-lambda
+                                   (('gn:binomialName . binomial-name)
+                                    (cons 'gn:inbredSetOfSpecies
+                                          (binomial-name->species-id binomial-name)))
+                                   (('gn:mappingMethodName . mapping-method-name)
+                                    (cons 'gn:inbredSetMappingMethod
+                                          (mapping-method-name->id mapping-method-name)))
+                                   (x x))
+                                 (process-metadata-alist alist))
+                     id)))
+                db
+                "SELECT InbredSet.Name, InbredSet.FullName, GeneticType, Family,
+Species.FullName AS BinomialName, MappingMethod.Name AS MappingMethodName
+FROM InbredSet
+INNER JOIN Species USING (SpeciesId)
+INNER JOIN MappingMethod ON InbredSet.MappingMethodId = MappingMethod.Id"))
+
+(define (phenotype-id->id id)
+  (string->symbol (string-append "gn:phenotype" (number->string id))))
+
+(define (dump-phenotype db)
+  (sql-for-each (lambda (alist)
+                  (let ((id (phenotype-id->id (assoc-ref alist "Id"))))
+                    (triple id 'rdf:type 'gn:phenotype)
+                    (scm->triples
+                     (filter (match-lambda
+                               (('gn:id . _) #f)
+                               (('gn:units . value)
+                                (string-ci=? value "unknown"))
+                               (_ #t))
+                             (process-metadata-alist alist))
+                     id)))
+                db
+                "SELECT Id, Pre_publication_description, Post_publication_description,
+Original_description, Units, Pre_publication_abbreviation, Post_publication_abbreviation,
+Lab_code, Submitter, Owner, Authorized_Users FROM Phenotype"))
+
+(define (dump-publication db)
+  (sql-for-each (lambda (alist)
+                  (let ((id (string-append "gn:publication"
+                                           (number->string (assoc-ref alist "Id")))))
+                    (triple id 'rdf:type 'gn:publication)
+                    (triple 'gn:title 'rdfs:subPropertyOf 'rdfs:label)
+                    (scm->triples
+                     (append-map (match-lambda
+                                   (('gn:id . _) '())
+                                   ;; The authors field is a comma
+                                   ;; separated list. Split it.
+                                   (('gn:authors . authors)
+                                    (map (lambda (author-name)
+                                           (cons 'gn:author (string-trim author-name)))
+                                         (string-split authors #\,)))
+                                   (('gn:abstract . abstract)
+                                    ;; TODO: Handle unprintable
+                                    ;; characters better.
+                                    (list (cons 'gn:abstract
+                                                (string-replace-substring abstract "\x01" ""))))
+                                   (x (list x)))
+                                 (process-metadata-alist alist))
+                     id)))
+                db
+                "SELECT Id, PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, Month, Year FROM Publication"))
+
+(define (dump-publish-xref db)
+  (sql-for-each (match-lambda
+                  (((_ . inbred-set-name)
+                    (_ . phenotype-id))
+                   (triple (phenotype-id->id phenotype-id)
+                           'gn:phenotypeOfSpecies
+                           (inbred-set-name->id inbred-set-name))))
+                db
+                "SELECT Name, PhenotypeId
+FROM PublishXRef
+INNER JOIN InbredSet USING (InbredSetId)"))
+
+(define (dump-data-table db table-name data-field)
+  (let ((dump-directory (string-append %dump-directory "/" table-name))
+        (port #f)
+        (current-strain-id #f))
+    (unless (file-exists? dump-directory)
+      (mkdir dump-directory))
+    (sql-for-each (match-lambda
+                    (((_ . strain-id)
+                      (_ . value))
+                     ;; Close file if new strain.
+                     (when (and port
+                                (not (= current-strain-id strain-id)))
+                       (close-port port)
+                       (set! port #f))
+                     ;; If no file is open, open new file.
+                     (unless port
+                       (set! current-strain-id strain-id)
+                       (let ((filename (string-append dump-directory
+                                                      "/" (number->string strain-id))))
+                         (display filename (current-error-port))
+                         (newline (current-error-port))
+                         (set! port (open-output-file filename))))
+                     (display value port)
+                     (newline port)))
+                  db
+                  (format #f "SELECT StrainId, ~a FROM ~a ORDER BY StrainId"
+                          data-field table-name))
+    (close-port port)))
+
+(call-with-genenetwork-database
+ (lambda (db)
+   (with-output-to-file (string-append %dump-directory "/dump.ttl")
+     (lambda ()
+       (format #t "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .~%")
+       (format #t "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .~%")
+       (format #t "@prefix gn: <https://genenetwork.org/> .~%~%")
+       (dump-species db)
+       (dump-strain db)
+       (dump-mapping-method db)
+       (dump-inbred-set db)
+       (dump-phenotype db)
+       (dump-publication db)
+       (dump-publish-xref db)))))
diff --git a/dump/sql.scm b/dump/sql.scm
new file mode 100644
index 0000000..576d6cd
--- /dev/null
+++ b/dump/sql.scm
@@ -0,0 +1,54 @@
+;;; Database helpers
+;;;
+;;; These functions should have been a part of guile-dbi. Never too
+;;; late to contribute upstream!
+
+(define-module (dump sql)
+  #:use-module (srfi srfi-26)
+  #:use-module (ice-9 match)
+  #:use-module (dbi dbi)
+  #:export (call-with-database
+            sql-exec
+            sql-fold
+            sql-map
+            sql-for-each
+            sql-find))
+
+(define (call-with-database backend connection-string proc)
+  (let ((db #f))
+    (dynamic-wind (lambda ()
+                    (set! db (dbi-open backend connection-string)))
+                  (cut proc db)
+                  (cut dbi-close db))))
+
+(define (database-check-status db)
+  (match (dbi-get_status db)
+    ((code . str)
+     (unless (zero? code)
+       (error str)))))
+
+(define (sql-exec db statement)
+  (dbi-query db statement)
+  (database-check-status db))
+
+(define (sql-fold proc init db statement)
+  (sql-exec db statement)
+  (let loop ((result init))
+    (let ((row (dbi-get_row db)))
+      (if row
+          (loop (proc row result))
+          result))))
+
+(define (sql-map proc db statement)
+  (sql-fold (lambda (row result)
+              (cons (proc row) result))
+            (list) db statement))
+
+(define (sql-for-each proc db statement)
+  (sql-fold (lambda (row _)
+              (proc row))
+            #f db statement))
+
+(define (sql-find db statement)
+  (sql-exec db statement)
+  (dbi-get_row db))
diff --git a/guix.scm b/guix.scm
new file mode 100644
index 0000000..b7aa107
--- /dev/null
+++ b/guix.scm
@@ -0,0 +1,88 @@
+;; Drop into a development environment using
+;;
+;; guix environment -m guix.scm
+;;
+;; Happy hacking!
+
+(use-modules (gnu packages autotools)
+             (gnu packages compression)
+             (gnu packages databases)
+             (gnu packages guile)
+             ((gnu packages guile-xyz) #:prefix guix:)
+             (gnu packages perl)
+             (gnu packages python)
+             (gnu packages python-web)
+             (gnu packages rdf)
+             (gnu packages tls)
+             (gnu packages web)
+             (guix build-system gnu)
+             (guix git-download)
+             ((guix licenses) #:prefix license:)
+             (guix packages)
+             (guix utils))
+
+;; Guix's guile-dbi package is missing a native-search-paths. Add
+;; it. TODO: Contribute upstream.
+(define guile-dbi
+  (package
+    (inherit guix:guile-dbi)
+    (native-search-paths
+     (list (search-path-specification
+            (variable "LD_LIBRARY_PATH")
+            (files '("lib")))))))
+
+(define guile-dbi-bootstrap
+  (package
+    (inherit guix:guile-dbi)
+    (name "guile-dbi-bootstrap")
+    (inputs '())
+    (arguments
+     (substitute-keyword-arguments (package-arguments guile-dbi)
+       ((#:make-flags _) '(list))))))
+
+;; TODO: Contribute guile-dbd-mysql upstream.
+(define guile-dbd-mysql
+  (let ((commit "e97589b6b018b206c901e4cc24db463407a4036b")
+        (revision "0"))
+    (package
+      (name "guile-dbd-mysql")
+      (version (git-version "2.1.6" revision commit))
+      (source
+       (origin
+         (method git-fetch)
+         (uri (git-reference
+               (url "https://github.com/opencog/guile-dbi")
+               (commit commit)))
+         (file-name (git-file-name name version))
+         (sha256
+          (base32
+           "0n1gv9a0kdys10a4qmnrwvg5sydwb03880asri4gqdchcj3fimni"))))
+      (build-system gnu-build-system)
+      (arguments
+       '(#:phases
+         (modify-phases %standard-phases
+           (add-after 'unpack 'chdir
+             (lambda _
+               ;; The upstream Git repository contains all the code, so change
+               ;; to the directory specific to guile-dbd-mysql.
+               (chdir "guile-dbd-mysql"))))))
+      (native-inputs
+       `(("autoconf" ,autoconf)
+         ("automake" ,automake)
+         ("guile-dbi-bootstrap" ,guile-dbi-bootstrap)
+         ("libtool" ,libtool)
+         ("openssl" ,openssl)
+         ("perl" ,perl)))
+      (inputs
+       `(("mysql" ,mysql)
+         ("zlib" ,zlib)))
+      (synopsis "Guile DBI driver for MySQL")
+      (home-page "https://github.com/opencog/guile-dbi/tree/master/guile-dbd-mysql")
+      (description "@code{guile-dbi} is a library for Guile that provides a
+convenient interface to SQL databases.  This package implements the interface
+for MySQL.")
+      (license license:gpl2+))))
+
+(packages->manifest
+ (list guile-3.0 guile-dbi guile-dbd-mysql
+       python python-rdflib python-urllib3))
diff --git a/rdf.py b/rdf.py
new file mode 100644
index 0000000..c2bd5e4
--- /dev/null
+++ b/rdf.py
@@ -0,0 +1,25 @@
+import os.path
+from pathlib import Path
+from rdflib import Graph
+
+graph = Graph()
+graph.parse(location=os.path.join(Path.home(), 'data/dump/dump.ttl'),
+            format='text/turtle')
+
+query = """
+PREFIX gn: <https://genenetwork.org/>
+
+SELECT ?name ?binomial
+WHERE {
+  ?species rdf:type gn:species .
+  ?strain rdf:type gn:strain .
+  ?strain gn:name "JN9" .
+  ?strain gn:strainOfSpecies ?species .
+
+  ?species gn:name ?name .
+  ?species gn:binomialName ?binomial .
+}
+"""
+
+for result in graph.query(query):
+    print(result)