aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArun Isaac2021-08-27 05:49:11 -0500
committerArun Isaac2021-08-27 05:49:11 -0500
commitb6cb5c4d2c64781122223d4ee100a0d6f3dd03b9 (patch)
tree0b53ac191d07c4599d2b58c0d2a96561bcb23cf1
downloadgn-transform-databases-b6cb5c4d2c64781122223d4ee100a0d6f3dd03b9.tar.gz
Initial commit
-rw-r--r--README.org3
-rwxr-xr-xdump.scm318
-rw-r--r--dump/sql.scm54
-rw-r--r--guix.scm88
-rw-r--r--rdf.py25
5 files changed, 488 insertions, 0 deletions
diff --git a/README.org b/README.org
new file mode 100644
index 0000000..ad7ff32
--- /dev/null
+++ b/README.org
@@ -0,0 +1,3 @@
+The GeneNetwork database is being migrated from a relational database
+to a plain text and RDF database. This repository contains code to
+dump the relational database to plain text.
diff --git a/dump.scm b/dump.scm
new file mode 100755
index 0000000..596a188
--- /dev/null
+++ b/dump.scm
@@ -0,0 +1,318 @@
+#! /usr/bin/env guile
+!#
+
+(add-to-load-path (dirname (current-filename)))
+
+(use-modules (rnrs io ports)
+ (srfi srfi-1)
+ (srfi srfi-26)
+ (ice-9 match)
+ (ice-9 string-fun)
+ (dump sql))
+
+
+;;; GeneNetwork database connection parameters and dump path
+
+(define (call-with-genenetwork-database proc)
+ (let ((connection-settings (call-with-input-file "conn.scm" read)))
+ (call-with-database "mysql" (string-join
+ (list (assq-ref connection-settings 'username)
+ (assq-ref connection-settings 'password)
+ (assq-ref connection-settings 'database)
+ "tcp"
+ (assq-ref connection-settings 'host)
+ (number->string
+ (assq-ref connection-settings 'port)))
+ ":")
+ proc)))
+
+(define %dump-directory
+ (string-append (getenv "HOME") "/data/dump"))
+
+(define (call-with-dump-file filename proc)
+ (let ((absolute-path (string-append %dump-directory filename)))
+ (display absolute-path)
+ (newline)
+ (call-with-output-file absolute-path proc)))
+
+
+;; Dump schema annotations to org
+
+(define (get-tables-from-comments db)
+ (sql-map (match-lambda
+ ((("TableName" . table)) table))
+ db
+ "SELECT TableName FROM TableComments"))
+
+(define (dump-table-fields db table)
+ (format #t "* ~a~%" table)
+ (match (sql-find db (format #f "SELECT Comment FROM TableComments WHERE TableName = '~a'"
+ table))
+ ((("Comment" . comment))
+ (format #t "~a~%" comment)))
+ (sql-for-each (lambda (row)
+ (match row
+ ((("TableField" . table-field)
+ ("Foreign_Key" . foreign-key)
+ ("Annotation" . annotation))
+ (format #t "** ~a~%" (substring table-field (1+ (string-length table))))
+ (unless (string-null? foreign-key)
+ (format #t "Foreign key to ~a~%" foreign-key))
+ (unless (string-null? annotation)
+ (display annotation)
+ (newline)))))
+ db
+ (format #f "SELECT TableField, Foreign_Key, Annotation FROM TableFieldAnnotation WHERE TableField LIKE '~a.%'"
+ table))
+ (newline))
+
+(define (dump-schema-annotations db)
+ (call-with-genenetwork-database
+ (lambda (db)
+ (for-each (cut dump-table-fields db <>)
+ (get-tables-from-comments db)))))
+
+
+;;; Dump tables
+
+(define (camel->kebab str)
+ (call-with-output-string
+ (lambda (port)
+ (string-for-each (lambda (c)
+ (when (and (not (zero? (port-position port)))
+ (char-set-contains? char-set:upper-case c))
+ (put-char port #\-))
+ (put-char port (char-downcase c)))
+ str))))
+
+(define (snake->lower-camel str)
+ (let ((char-list (string->list str)))
+ (call-with-output-string
+ (lambda (port)
+ (put-char port (char-downcase (string-ref str 0)))
+ (map (lambda (char previous-char)
+ (unless (char=? char #\_)
+ (put-char port (if (char=? previous-char #\_)
+ (char-upcase char)
+ char))))
+ (drop char-list 1)
+ char-list)))))
+
+(define (camel->lower-camel str)
+ (string-append (string-downcase (substring str 0 1))
+ (substring str 1)))
+
+(define (scm->triples alist id)
+ (for-each (match-lambda
+ ((predicate . object)
+ (triple id predicate object)))
+ alist))
+
+(define (process-metadata-alist alist)
+ (filter-map (match-lambda
+ ((key . "") #f)
+ ((key . value)
+ (cons (string->symbol
+ (string-append
+ "gn:" (camel->lower-camel
+ (snake->lower-camel key))))
+ value)))
+ alist))
+
+(define (triple subject predicate object)
+ (format #t "~a ~a ~s .~%" subject predicate object))
+
+(define (binomial-name->species-id binomial-name)
+ (string->symbol
+ (string-append "gn:" (string-replace-substring binomial-name " " "_"))))
+
+(define (dump-species db)
+ (sql-for-each (lambda (alist)
+ (match alist
+ (((_ . common-name)
+ (_ . menu-name)
+ (_ . binomial-name))
+ (let ((id (binomial-name->species-id binomial-name)))
+ (triple id 'rdf:type 'gn:species)
+ ;; Common name
+ (triple id 'gn:name common-name)
+ ;; Menu name (TODO: Maybe, drop this
+ ;; field. It can be inferred from the
+ ;; common name.)
+ (triple id 'gn:menuName menu-name)
+ ;; Binomial name
+ (triple id 'gn:binomialName binomial-name)))))
+ db
+ "SELECT SpeciesName, MenuName, FullName FROM Species"))
+
+(define (dump-strain db)
+ (sql-for-each (lambda (alist)
+ (match alist
+ (((_ . binomial-name)
+ (_ . name)
+ (_ . name2)
+ (_ . symbol)
+ (_ . alias))
+ (let ((id
+ ;; TODO: Ensure this identifier does not collide.
+ (string-append "gn:"
+ (string-map (lambda (c)
+ (case c
+ ((#\/ #\< #\> #\+ #\( #\) #\space) #\_)
+ (else c)))
+ name))))
+ (triple id 'rdf:type 'gn:strain)
+ ;; The species this is a strain of
+ (triple id 'gn:strainOfSpecies
+ (binomial-name->species-id binomial-name))
+ ;; Name
+ (triple id 'gn:name name)
+ ;; A second name, if there is one
+ (unless (string=? name name2)
+ (triple id 'gn:name name2))
+ ;; Symbol, if there is one
+ (unless (string-null? symbol)
+ (triple id 'gn:symbol symbol))
+ ;; Alias, if there is one
+ (unless (string-null? alias)
+ (triple id 'gn:alias alias))))))
+ db
+ "SELECT Species.FullName, Strain.Name, Strain.Name2, Strain.Symbol, Strain.Alias FROM Strain JOIN Species ON Strain.SpeciesId = Species.SpeciesId"))
+
+(define (mapping-method-name->id name)
+ (string->symbol (string-append "gn:mappingMethod" name)))
+
+(define (dump-mapping-method db)
+ (sql-for-each (match-lambda
+ (((_ . name))
+ (triple (string-append "gn:mappingMethod" name)
+ 'rdf:type 'gn:mappingMethod)))
+ db
+ "SELECT Name FROM MappingMethod"))
+
+(define (inbred-set-name->id name)
+ (string->symbol (string-append "gn:inbredSet" name)))
+
+(define (dump-inbred-set db)
+ (sql-for-each (lambda (alist)
+ (let ((id (inbred-set-name->id (assoc-ref alist "Name"))))
+ (triple id 'rdf:type 'gn:phenotype)
+ (scm->triples
+ (filter-map (match-lambda
+ (('gn:binomialName . binomial-name)
+ (cons 'gn:inbredSetOfSpecies
+ (binomial-name->species-id binomial-name)))
+ (('gn:mappingMethodName . mapping-method-name)
+ (cons 'gn:inbredSetMappingMethod
+ (mapping-method-name->id mapping-method-name)))
+ (x x))
+ (process-metadata-alist alist))
+ id)))
+ db
+ "SELECT InbredSet.Name, InbredSet.FullName, GeneticType, Family,
+Species.FullName AS BinomialName, MappingMethod.Name AS MappingMethodName
+FROM InbredSet
+INNER JOIN Species USING (SpeciesId)
+INNER JOIN MappingMethod ON InbredSet.MappingMethodId = MappingMethod.Id"))
+
+(define (phenotype-id->id id)
+ (string->symbol (string-append "gn:phenotype" (number->string id))))
+
+(define (dump-phenotype db)
+ (sql-for-each (lambda (alist)
+ (let ((id (phenotype-id->id (assoc-ref alist "Id"))))
+ (triple id 'rdf:type 'gn:phenotype)
+ (scm->triples
+ (filter (match-lambda
+ (('gn:id . _) #f)
+ (('gn:units . value)
+ (string-ci=? value "unknown"))
+ (_ #t))
+ (process-metadata-alist alist))
+ id)))
+ db
+ "SELECT Id, Pre_publication_description, Post_publication_description,
+Original_description, Units, Pre_publication_abbreviation, Post_publication_abbreviation,
+Lab_code, Submitter, Owner, Authorized_Users FROM Phenotype"))
+
+(define (dump-publication db)
+ (sql-for-each (lambda (alist)
+ (let ((id (string-append "gn:publication"
+ (number->string (assoc-ref alist "Id")))))
+ (triple id 'rdf:type 'gn:publication)
+ (triple 'gn:title 'rdfs:subPropertyOf 'rdfs:label)
+ (scm->triples
+ (append-map (match-lambda
+ (('gn:id . _) '())
+ ;; The authors field is a comma
+ ;; separated list. Split it.
+ (('gn:authors . authors)
+ (map (lambda (author-name)
+ (cons 'gn:author (string-trim author-name)))
+ (string-split authors #\,)))
+ (('gn:abstract . abstract)
+ ;; TODO: Handle unprintable
+ ;; characters better.
+ (list (cons 'gn:abstract
+ (string-replace-substring abstract "\x01" ""))))
+ (x (list x)))
+ (process-metadata-alist alist))
+ id)))
+ db
+ "SELECT Id, PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, Month, Year FROM Publication"))
+
+(define (dump-publish-xref db)
+ (sql-for-each (match-lambda
+ (((_ . inbred-set-name)
+ (_ . phenotype-id))
+ (triple (phenotype-id->id phenotype-id)
+ 'gn:phenotypeOfSpecies
+ (inbred-set-name->id inbred-set-name))))
+ db
+ "SELECT Name, PhenotypeId
+FROM PublishXRef
+INNER JOIN InbredSet USING (InbredSetId)"))
+
+(define (dump-data-table db table-name data-field)
+ (let ((dump-directory (string-append %dump-directory "/" table-name))
+ (port #f)
+ (current-strain-id #f))
+ (unless (file-exists? dump-directory)
+ (mkdir dump-directory))
+ (sql-for-each (match-lambda
+ (((_ . strain-id)
+ (_ . value))
+ ;; Close file if new strain.
+ (when (and port
+ (not (= current-strain-id strain-id)))
+ (close-port port)
+ (set! port #f))
+ ;; If no file is open, open new file.
+ (unless port
+ (set! current-strain-id strain-id)
+ (let ((filename (string-append dump-directory
+ "/" (number->string strain-id))))
+ (display filename (current-error-port))
+ (newline (current-error-port))
+ (set! port (open-output-file filename))))
+ (display value port)
+ (newline port)))
+ db
+ (format #f "SELECT StrainId, ~a FROM ~a ORDER BY StrainId"
+ data-field table-name))
+ (close-port port)))
+
+(call-with-genenetwork-database
+ (lambda (db)
+ (with-output-to-file (string-append %dump-directory "/dump.ttl")
+ (lambda ()
+ (format #t "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .~%")
+ (format #t "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .~%")
+ (format #t "@prefix gn: <https://genenetwork.org/> .~%~%")
+ (dump-species db)
+ (dump-strain db)
+ (dump-mapping-method db)
+ (dump-inbred-set db)
+ (dump-phenotype db)
+ (dump-publication db)
+ (dump-publish-xref db)))))
diff --git a/dump/sql.scm b/dump/sql.scm
new file mode 100644
index 0000000..576d6cd
--- /dev/null
+++ b/dump/sql.scm
@@ -0,0 +1,54 @@
+;;; Database helpers
+;;;
+;;; These functions should have been a part of guile-dbi. Never too
+;;; late to contribute upstream!
+
+(define-module (dump sql)
+ #:use-module (srfi srfi-26)
+ #:use-module (ice-9 match)
+ #:use-module (dbi dbi)
+ #:export (call-with-database
+ sql-exec
+ sql-fold
+ sql-map
+ sql-for-each
+ sql-find))
+
+(define (call-with-database backend connection-string proc)
+ (let ((db #f))
+ (dynamic-wind (lambda ()
+ (set! db (dbi-open backend connection-string)))
+ (cut proc db)
+ (cut dbi-close db))))
+
+(define (database-check-status db)
+ (match (dbi-get_status db)
+ ((code . str)
+ (unless (zero? code)
+ (error str)))))
+
+(define (sql-exec db statement)
+ (dbi-query db statement)
+ (database-check-status db))
+
+(define (sql-fold proc init db statement)
+ (sql-exec db statement)
+ (let loop ((result init))
+ (let ((row (dbi-get_row db)))
+ (if row
+ (loop (proc row result))
+ result))))
+
+(define (sql-map proc db statement)
+ (sql-fold (lambda (row result)
+ (cons (proc row) result))
+ (list) db statement))
+
+(define (sql-for-each proc db statement)
+ (sql-fold (lambda (row _)
+ (proc row))
+ #f db statement))
+
+(define (sql-find db statement)
+ (sql-exec db statement)
+ (dbi-get_row db))
diff --git a/guix.scm b/guix.scm
new file mode 100644
index 0000000..b7aa107
--- /dev/null
+++ b/guix.scm
@@ -0,0 +1,88 @@
+;; Drop into a development environment using
+;;
+;; guix environment -m guix.scm
+;;
+;; Happy hacking!
+
+(use-modules (gnu packages autotools)
+ (gnu packages compression)
+ (gnu packages databases)
+ (gnu packages guile)
+ ((gnu packages guile-xyz) #:prefix guix:)
+ (gnu packages perl)
+ (gnu packages python)
+ (gnu packages python-web)
+ (gnu packages rdf)
+ (gnu packages tls)
+ (gnu packages web)
+ (guix build-system gnu)
+ (guix git-download)
+ ((guix licenses) #:prefix license:)
+ (guix packages)
+ (guix utils))
+
+;; Guix's guile-dbi package is missing a native-search-paths. Add
+;; it. TODO: Contribute upstream.
+(define guile-dbi
+ (package
+ (inherit guix:guile-dbi)
+ (native-search-paths
+ (list (search-path-specification
+ (variable "LD_LIBRARY_PATH")
+ (files '("lib")))))))
+
+(define guile-dbi-bootstrap
+ (package
+ (inherit guix:guile-dbi)
+ (name "guile-dbi-bootstrap")
+ (inputs '())
+ (arguments
+ (substitute-keyword-arguments (package-arguments guile-dbi)
+ ((#:make-flags _) '(list))))))
+
+;; TODO: Contribute guile-dbd-mysql upstream.
+(define guile-dbd-mysql
+ (let ((commit "e97589b6b018b206c901e4cc24db463407a4036b")
+ (revision "0"))
+ (package
+ (name "guile-dbd-mysql")
+ (version (git-version "2.1.6" revision commit))
+ (source
+ (origin
+ (method git-fetch)
+ (uri (git-reference
+ (url "https://github.com/opencog/guile-dbi")
+ (commit commit)))
+ (file-name (git-file-name name version))
+ (sha256
+ (base32
+ "0n1gv9a0kdys10a4qmnrwvg5sydwb03880asri4gqdchcj3fimni"))))
+ (build-system gnu-build-system)
+ (arguments
+ '(#:phases
+ (modify-phases %standard-phases
+ (add-after 'unpack 'chdir
+ (lambda _
+ ;; The upstream Git repository contains all the code, so change
+ ;; to the directory specific to guile-dbd-mysql.
+ (chdir "guile-dbd-mysql"))))))
+ (native-inputs
+ `(("autoconf" ,autoconf)
+ ("automake" ,automake)
+ ("guile-dbi-bootstrap" ,guile-dbi-bootstrap)
+ ("libtool" ,libtool)
+ ("openssl" ,openssl)
+ ("perl" ,perl)))
+ (inputs
+ `(("mysql" ,mysql)
+ ("zlib" ,zlib)))
+ (synopsis "Guile DBI driver for MySQL")
+ (home-page "https://github.com/opencog/guile-dbi/tree/master/guile-dbd-mysql")
+ (description "@code{guile-dbi} is a library for Guile that provides a
+convenient interface to SQL databases. This package implements the interface
+for MySQL.")
+ (license license:gpl2+))))
+
+(packages->manifest
+ (list guile-3.0 guile-dbi guile-dbd-mysql
+ python python-rdflib python-urllib3))
diff --git a/rdf.py b/rdf.py
new file mode 100644
index 0000000..c2bd5e4
--- /dev/null
+++ b/rdf.py
@@ -0,0 +1,25 @@
+import os.path
+from pathlib import Path
+from rdflib import Graph
+
+graph = Graph()
+graph.parse(location=os.path.join(Path.home(), 'data/dump/dump.ttl'),
+ format='text/turtle')
+
+query = """
+PREFIX gn: <https://genenetwork.org/>
+
+SELECT ?name ?binomial
+WHERE {
+ ?species rdf:type gn:species .
+ ?strain rdf:type gn:strain .
+ ?strain gn:name "JN9" .
+ ?strain gn:strainOfSpecies ?species .
+
+ ?species gn:name ?name .
+ ?species gn:binomialName ?binomial .
+}
+"""
+
+for result in graph.query(query):
+ print(result)