diff options
author | Arun Isaac | 2021-08-27 05:49:11 -0500 |
---|---|---|
committer | Arun Isaac | 2021-08-27 05:49:11 -0500 |
commit | b6cb5c4d2c64781122223d4ee100a0d6f3dd03b9 (patch) | |
tree | 0b53ac191d07c4599d2b58c0d2a96561bcb23cf1 | |
download | gn-transform-databases-b6cb5c4d2c64781122223d4ee100a0d6f3dd03b9.tar.gz |
Initial commit
-rw-r--r-- | README.org | 3 | ||||
-rwxr-xr-x | dump.scm | 318 | ||||
-rw-r--r-- | dump/sql.scm | 54 | ||||
-rw-r--r-- | guix.scm | 88 | ||||
-rw-r--r-- | rdf.py | 25 |
5 files changed, 488 insertions, 0 deletions
diff --git a/README.org b/README.org new file mode 100644 index 0000000..ad7ff32 --- /dev/null +++ b/README.org @@ -0,0 +1,3 @@ +The GeneNetwork database is being migrated from a relational database +to a plain text and RDF database. This repository contains code to +dump the relational database to plain text. diff --git a/dump.scm b/dump.scm new file mode 100755 index 0000000..596a188 --- /dev/null +++ b/dump.scm @@ -0,0 +1,318 @@ +#! /usr/bin/env guile +!# + +(add-to-load-path (dirname (current-filename))) + +(use-modules (rnrs io ports) + (srfi srfi-1) + (srfi srfi-26) + (ice-9 match) + (ice-9 string-fun) + (dump sql)) + + +;;; GeneNetwork database connection parameters and dump path + +(define (call-with-genenetwork-database proc) + (let ((connection-settings (call-with-input-file "conn.scm" read))) + (call-with-database "mysql" (string-join + (list (assq-ref connection-settings 'username) + (assq-ref connection-settings 'password) + (assq-ref connection-settings 'database) + "tcp" + (assq-ref connection-settings 'host) + (number->string + (assq-ref connection-settings 'port))) + ":") + proc))) + +(define %dump-directory + (string-append (getenv "HOME") "/data/dump")) + +(define (call-with-dump-file filename proc) + (let ((absolute-path (string-append %dump-directory filename))) + (display absolute-path) + (newline) + (call-with-output-file absolute-path proc))) + + +;; Dump schema annotations to org + +(define (get-tables-from-comments db) + (sql-map (match-lambda + ((("TableName" . table)) table)) + db + "SELECT TableName FROM TableComments")) + +(define (dump-table-fields db table) + (format #t "* ~a~%" table) + (match (sql-find db (format #f "SELECT Comment FROM TableComments WHERE TableName = '~a'" + table)) + ((("Comment" . comment)) + (format #t "~a~%" comment))) + (sql-for-each (lambda (row) + (match row + ((("TableField" . table-field) + ("Foreign_Key" . foreign-key) + ("Annotation" . annotation)) + (format #t "** ~a~%" (substring table-field (1+ (string-length table)))) + (unless (string-null? foreign-key) + (format #t "Foreign key to ~a~%" foreign-key)) + (unless (string-null? annotation) + (display annotation) + (newline))))) + db + (format #f "SELECT TableField, Foreign_Key, Annotation FROM TableFieldAnnotation WHERE TableField LIKE '~a.%'" + table)) + (newline)) + +(define (dump-schema-annotations db) + (call-with-genenetwork-database + (lambda (db) + (for-each (cut dump-table-fields db <>) + (get-tables-from-comments db))))) + + +;;; Dump tables + +(define (camel->kebab str) + (call-with-output-string + (lambda (port) + (string-for-each (lambda (c) + (when (and (not (zero? (port-position port))) + (char-set-contains? char-set:upper-case c)) + (put-char port #\-)) + (put-char port (char-downcase c))) + str)))) + +(define (snake->lower-camel str) + (let ((char-list (string->list str))) + (call-with-output-string + (lambda (port) + (put-char port (char-downcase (string-ref str 0))) + (map (lambda (char previous-char) + (unless (char=? char #\_) + (put-char port (if (char=? previous-char #\_) + (char-upcase char) + char)))) + (drop char-list 1) + char-list))))) + +(define (camel->lower-camel str) + (string-append (string-downcase (substring str 0 1)) + (substring str 1))) + +(define (scm->triples alist id) + (for-each (match-lambda + ((predicate . object) + (triple id predicate object))) + alist)) + +(define (process-metadata-alist alist) + (filter-map (match-lambda + ((key . "") #f) + ((key . value) + (cons (string->symbol + (string-append + "gn:" (camel->lower-camel + (snake->lower-camel key)))) + value))) + alist)) + +(define (triple subject predicate object) + (format #t "~a ~a ~s .~%" subject predicate object)) + +(define (binomial-name->species-id binomial-name) + (string->symbol + (string-append "gn:" (string-replace-substring binomial-name " " "_")))) + +(define (dump-species db) + (sql-for-each (lambda (alist) + (match alist + (((_ . common-name) + (_ . menu-name) + (_ . binomial-name)) + (let ((id (binomial-name->species-id binomial-name))) + (triple id 'rdf:type 'gn:species) + ;; Common name + (triple id 'gn:name common-name) + ;; Menu name (TODO: Maybe, drop this + ;; field. It can be inferred from the + ;; common name.) + (triple id 'gn:menuName menu-name) + ;; Binomial name + (triple id 'gn:binomialName binomial-name))))) + db + "SELECT SpeciesName, MenuName, FullName FROM Species")) + +(define (dump-strain db) + (sql-for-each (lambda (alist) + (match alist + (((_ . binomial-name) + (_ . name) + (_ . name2) + (_ . symbol) + (_ . alias)) + (let ((id + ;; TODO: Ensure this identifier does not collide. + (string-append "gn:" + (string-map (lambda (c) + (case c + ((#\/ #\< #\> #\+ #\( #\) #\space) #\_) + (else c))) + name)))) + (triple id 'rdf:type 'gn:strain) + ;; The species this is a strain of + (triple id 'gn:strainOfSpecies + (binomial-name->species-id binomial-name)) + ;; Name + (triple id 'gn:name name) + ;; A second name, if there is one + (unless (string=? name name2) + (triple id 'gn:name name2)) + ;; Symbol, if there is one + (unless (string-null? symbol) + (triple id 'gn:symbol symbol)) + ;; Alias, if there is one + (unless (string-null? alias) + (triple id 'gn:alias alias)))))) + db + "SELECT Species.FullName, Strain.Name, Strain.Name2, Strain.Symbol, Strain.Alias FROM Strain JOIN Species ON Strain.SpeciesId = Species.SpeciesId")) + +(define (mapping-method-name->id name) + (string->symbol (string-append "gn:mappingMethod" name))) + +(define (dump-mapping-method db) + (sql-for-each (match-lambda + (((_ . name)) + (triple (string-append "gn:mappingMethod" name) + 'rdf:type 'gn:mappingMethod))) + db + "SELECT Name FROM MappingMethod")) + +(define (inbred-set-name->id name) + (string->symbol (string-append "gn:inbredSet" name))) + +(define (dump-inbred-set db) + (sql-for-each (lambda (alist) + (let ((id (inbred-set-name->id (assoc-ref alist "Name")))) + (triple id 'rdf:type 'gn:phenotype) + (scm->triples + (filter-map (match-lambda + (('gn:binomialName . binomial-name) + (cons 'gn:inbredSetOfSpecies + (binomial-name->species-id binomial-name))) + (('gn:mappingMethodName . mapping-method-name) + (cons 'gn:inbredSetMappingMethod + (mapping-method-name->id mapping-method-name))) + (x x)) + (process-metadata-alist alist)) + id))) + db + "SELECT InbredSet.Name, InbredSet.FullName, GeneticType, Family, +Species.FullName AS BinomialName, MappingMethod.Name AS MappingMethodName +FROM InbredSet +INNER JOIN Species USING (SpeciesId) +INNER JOIN MappingMethod ON InbredSet.MappingMethodId = MappingMethod.Id")) + +(define (phenotype-id->id id) + (string->symbol (string-append "gn:phenotype" (number->string id)))) + +(define (dump-phenotype db) + (sql-for-each (lambda (alist) + (let ((id (phenotype-id->id (assoc-ref alist "Id")))) + (triple id 'rdf:type 'gn:phenotype) + (scm->triples + (filter (match-lambda + (('gn:id . _) #f) + (('gn:units . value) + (string-ci=? value "unknown")) + (_ #t)) + (process-metadata-alist alist)) + id))) + db + "SELECT Id, Pre_publication_description, Post_publication_description, +Original_description, Units, Pre_publication_abbreviation, Post_publication_abbreviation, +Lab_code, Submitter, Owner, Authorized_Users FROM Phenotype")) + +(define (dump-publication db) + (sql-for-each (lambda (alist) + (let ((id (string-append "gn:publication" + (number->string (assoc-ref alist "Id"))))) + (triple id 'rdf:type 'gn:publication) + (triple 'gn:title 'rdfs:subPropertyOf 'rdfs:label) + (scm->triples + (append-map (match-lambda + (('gn:id . _) '()) + ;; The authors field is a comma + ;; separated list. Split it. + (('gn:authors . authors) + (map (lambda (author-name) + (cons 'gn:author (string-trim author-name))) + (string-split authors #\,))) + (('gn:abstract . abstract) + ;; TODO: Handle unprintable + ;; characters better. + (list (cons 'gn:abstract + (string-replace-substring abstract "\x01" "")))) + (x (list x))) + (process-metadata-alist alist)) + id))) + db + "SELECT Id, PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, Month, Year FROM Publication")) + +(define (dump-publish-xref db) + (sql-for-each (match-lambda + (((_ . inbred-set-name) + (_ . phenotype-id)) + (triple (phenotype-id->id phenotype-id) + 'gn:phenotypeOfSpecies + (inbred-set-name->id inbred-set-name)))) + db + "SELECT Name, PhenotypeId +FROM PublishXRef +INNER JOIN InbredSet USING (InbredSetId)")) + +(define (dump-data-table db table-name data-field) + (let ((dump-directory (string-append %dump-directory "/" table-name)) + (port #f) + (current-strain-id #f)) + (unless (file-exists? dump-directory) + (mkdir dump-directory)) + (sql-for-each (match-lambda + (((_ . strain-id) + (_ . value)) + ;; Close file if new strain. + (when (and port + (not (= current-strain-id strain-id))) + (close-port port) + (set! port #f)) + ;; If no file is open, open new file. + (unless port + (set! current-strain-id strain-id) + (let ((filename (string-append dump-directory + "/" (number->string strain-id)))) + (display filename (current-error-port)) + (newline (current-error-port)) + (set! port (open-output-file filename)))) + (display value port) + (newline port))) + db + (format #f "SELECT StrainId, ~a FROM ~a ORDER BY StrainId" + data-field table-name)) + (close-port port))) + +(call-with-genenetwork-database + (lambda (db) + (with-output-to-file (string-append %dump-directory "/dump.ttl") + (lambda () + (format #t "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .~%") + (format #t "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .~%") + (format #t "@prefix gn: <https://genenetwork.org/> .~%~%") + (dump-species db) + (dump-strain db) + (dump-mapping-method db) + (dump-inbred-set db) + (dump-phenotype db) + (dump-publication db) + (dump-publish-xref db))))) diff --git a/dump/sql.scm b/dump/sql.scm new file mode 100644 index 0000000..576d6cd --- /dev/null +++ b/dump/sql.scm @@ -0,0 +1,54 @@ +;;; Database helpers +;;; +;;; These functions should have been a part of guile-dbi. Never too +;;; late to contribute upstream! + +(define-module (dump sql) + #:use-module (srfi srfi-26) + #:use-module (ice-9 match) + #:use-module (dbi dbi) + #:export (call-with-database + sql-exec + sql-fold + sql-map + sql-for-each + sql-find)) + +(define (call-with-database backend connection-string proc) + (let ((db #f)) + (dynamic-wind (lambda () + (set! db (dbi-open backend connection-string))) + (cut proc db) + (cut dbi-close db)))) + +(define (database-check-status db) + (match (dbi-get_status db) + ((code . str) + (unless (zero? code) + (error str))))) + +(define (sql-exec db statement) + (dbi-query db statement) + (database-check-status db)) + +(define (sql-fold proc init db statement) + (sql-exec db statement) + (let loop ((result init)) + (let ((row (dbi-get_row db))) + (if row + (loop (proc row result)) + result)))) + +(define (sql-map proc db statement) + (sql-fold (lambda (row result) + (cons (proc row) result)) + (list) db statement)) + +(define (sql-for-each proc db statement) + (sql-fold (lambda (row _) + (proc row)) + #f db statement)) + +(define (sql-find db statement) + (sql-exec db statement) + (dbi-get_row db)) diff --git a/guix.scm b/guix.scm new file mode 100644 index 0000000..b7aa107 --- /dev/null +++ b/guix.scm @@ -0,0 +1,88 @@ +;; Drop into a development environment using +;; +;; guix environment -m guix.scm +;; +;; Happy hacking! + +(use-modules (gnu packages autotools) + (gnu packages compression) + (gnu packages databases) + (gnu packages guile) + ((gnu packages guile-xyz) #:prefix guix:) + (gnu packages perl) + (gnu packages python) + (gnu packages python-web) + (gnu packages rdf) + (gnu packages tls) + (gnu packages web) + (guix build-system gnu) + (guix git-download) + ((guix licenses) #:prefix license:) + (guix packages) + (guix utils)) + +;; Guix's guile-dbi package is missing a native-search-paths. Add +;; it. TODO: Contribute upstream. +(define guile-dbi + (package + (inherit guix:guile-dbi) + (native-search-paths + (list (search-path-specification + (variable "LD_LIBRARY_PATH") + (files '("lib"))))))) + +(define guile-dbi-bootstrap + (package + (inherit guix:guile-dbi) + (name "guile-dbi-bootstrap") + (inputs '()) + (arguments + (substitute-keyword-arguments (package-arguments guile-dbi) + ((#:make-flags _) '(list)))))) + +;; TODO: Contribute guile-dbd-mysql upstream. +(define guile-dbd-mysql + (let ((commit "e97589b6b018b206c901e4cc24db463407a4036b") + (revision "0")) + (package + (name "guile-dbd-mysql") + (version (git-version "2.1.6" revision commit)) + (source + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/opencog/guile-dbi") + (commit commit))) + (file-name (git-file-name name version)) + (sha256 + (base32 + "0n1gv9a0kdys10a4qmnrwvg5sydwb03880asri4gqdchcj3fimni")))) + (build-system gnu-build-system) + (arguments + '(#:phases + (modify-phases %standard-phases + (add-after 'unpack 'chdir + (lambda _ + ;; The upstream Git repository contains all the code, so change + ;; to the directory specific to guile-dbd-mysql. + (chdir "guile-dbd-mysql")))))) + (native-inputs + `(("autoconf" ,autoconf) + ("automake" ,automake) + ("guile-dbi-bootstrap" ,guile-dbi-bootstrap) + ("libtool" ,libtool) + ("openssl" ,openssl) + ("perl" ,perl))) + (inputs + `(("mysql" ,mysql) + ("zlib" ,zlib))) + (synopsis "Guile DBI driver for MySQL") + (home-page "https://github.com/opencog/guile-dbi/tree/master/guile-dbd-mysql") + (description "@code{guile-dbi} is a library for Guile that provides a +convenient interface to SQL databases. This package implements the interface +for MySQL.") + (license license:gpl2+)))) + +(packages->manifest + (list guile-3.0 guile-dbi guile-dbd-mysql + python python-rdflib python-urllib3)) @@ -0,0 +1,25 @@ +import os.path +from pathlib import Path +from rdflib import Graph + +graph = Graph() +graph.parse(location=os.path.join(Path.home(), 'data/dump/dump.ttl'), + format='text/turtle') + +query = """ +PREFIX gn: <https://genenetwork.org/> + +SELECT ?name ?binomial +WHERE { + ?species rdf:type gn:species . + ?strain rdf:type gn:strain . + ?strain gn:name "JN9" . + ?strain gn:strainOfSpecies ?species . + + ?species gn:name ?name . + ?species gn:binomialName ?binomial . +} +""" + +for result in graph.query(query): + print(result) |