aboutsummaryrefslogtreecommitdiff
path: root/dump.scm
diff options
context:
space:
mode:
Diffstat (limited to 'dump.scm')
-rwxr-xr-xdump.scm318
1 files changed, 318 insertions, 0 deletions
diff --git a/dump.scm b/dump.scm
new file mode 100755
index 0000000..596a188
--- /dev/null
+++ b/dump.scm
@@ -0,0 +1,318 @@
+#! /usr/bin/env guile
+!#
+
+(add-to-load-path (dirname (current-filename)))
+
+(use-modules (rnrs io ports)
+ (srfi srfi-1)
+ (srfi srfi-26)
+ (ice-9 match)
+ (ice-9 string-fun)
+ (dump sql))
+
+
+;;; GeneNetwork database connection parameters and dump path
+
+(define (call-with-genenetwork-database proc)
+ (let ((connection-settings (call-with-input-file "conn.scm" read)))
+ (call-with-database "mysql" (string-join
+ (list (assq-ref connection-settings 'username)
+ (assq-ref connection-settings 'password)
+ (assq-ref connection-settings 'database)
+ "tcp"
+ (assq-ref connection-settings 'host)
+ (number->string
+ (assq-ref connection-settings 'port)))
+ ":")
+ proc)))
+
+(define %dump-directory
+ (string-append (getenv "HOME") "/data/dump"))
+
+(define (call-with-dump-file filename proc)
+ (let ((absolute-path (string-append %dump-directory filename)))
+ (display absolute-path)
+ (newline)
+ (call-with-output-file absolute-path proc)))
+
+
+;; Dump schema annotations to org
+
+(define (get-tables-from-comments db)
+ (sql-map (match-lambda
+ ((("TableName" . table)) table))
+ db
+ "SELECT TableName FROM TableComments"))
+
+(define (dump-table-fields db table)
+ (format #t "* ~a~%" table)
+ (match (sql-find db (format #f "SELECT Comment FROM TableComments WHERE TableName = '~a'"
+ table))
+ ((("Comment" . comment))
+ (format #t "~a~%" comment)))
+ (sql-for-each (lambda (row)
+ (match row
+ ((("TableField" . table-field)
+ ("Foreign_Key" . foreign-key)
+ ("Annotation" . annotation))
+ (format #t "** ~a~%" (substring table-field (1+ (string-length table))))
+ (unless (string-null? foreign-key)
+ (format #t "Foreign key to ~a~%" foreign-key))
+ (unless (string-null? annotation)
+ (display annotation)
+ (newline)))))
+ db
+ (format #f "SELECT TableField, Foreign_Key, Annotation FROM TableFieldAnnotation WHERE TableField LIKE '~a.%'"
+ table))
+ (newline))
+
+(define (dump-schema-annotations db)
+ (call-with-genenetwork-database
+ (lambda (db)
+ (for-each (cut dump-table-fields db <>)
+ (get-tables-from-comments db)))))
+
+
+;;; Dump tables
+
+(define (camel->kebab str)
+ (call-with-output-string
+ (lambda (port)
+ (string-for-each (lambda (c)
+ (when (and (not (zero? (port-position port)))
+ (char-set-contains? char-set:upper-case c))
+ (put-char port #\-))
+ (put-char port (char-downcase c)))
+ str))))
+
+(define (snake->lower-camel str)
+ (let ((char-list (string->list str)))
+ (call-with-output-string
+ (lambda (port)
+ (put-char port (char-downcase (string-ref str 0)))
+ (map (lambda (char previous-char)
+ (unless (char=? char #\_)
+ (put-char port (if (char=? previous-char #\_)
+ (char-upcase char)
+ char))))
+ (drop char-list 1)
+ char-list)))))
+
+(define (camel->lower-camel str)
+ (string-append (string-downcase (substring str 0 1))
+ (substring str 1)))
+
+(define (scm->triples alist id)
+ (for-each (match-lambda
+ ((predicate . object)
+ (triple id predicate object)))
+ alist))
+
+(define (process-metadata-alist alist)
+ (filter-map (match-lambda
+ ((key . "") #f)
+ ((key . value)
+ (cons (string->symbol
+ (string-append
+ "gn:" (camel->lower-camel
+ (snake->lower-camel key))))
+ value)))
+ alist))
+
+(define (triple subject predicate object)
+ (format #t "~a ~a ~s .~%" subject predicate object))
+
+(define (binomial-name->species-id binomial-name)
+ (string->symbol
+ (string-append "gn:" (string-replace-substring binomial-name " " "_"))))
+
+(define (dump-species db)
+ (sql-for-each (lambda (alist)
+ (match alist
+ (((_ . common-name)
+ (_ . menu-name)
+ (_ . binomial-name))
+ (let ((id (binomial-name->species-id binomial-name)))
+ (triple id 'rdf:type 'gn:species)
+ ;; Common name
+ (triple id 'gn:name common-name)
+ ;; Menu name (TODO: Maybe, drop this
+ ;; field. It can be inferred from the
+ ;; common name.)
+ (triple id 'gn:menuName menu-name)
+ ;; Binomial name
+ (triple id 'gn:binomialName binomial-name)))))
+ db
+ "SELECT SpeciesName, MenuName, FullName FROM Species"))
+
+(define (dump-strain db)
+ (sql-for-each (lambda (alist)
+ (match alist
+ (((_ . binomial-name)
+ (_ . name)
+ (_ . name2)
+ (_ . symbol)
+ (_ . alias))
+ (let ((id
+ ;; TODO: Ensure this identifier does not collide.
+ (string-append "gn:"
+ (string-map (lambda (c)
+ (case c
+ ((#\/ #\< #\> #\+ #\( #\) #\space) #\_)
+ (else c)))
+ name))))
+ (triple id 'rdf:type 'gn:strain)
+ ;; The species this is a strain of
+ (triple id 'gn:strainOfSpecies
+ (binomial-name->species-id binomial-name))
+ ;; Name
+ (triple id 'gn:name name)
+ ;; A second name, if there is one
+ (unless (string=? name name2)
+ (triple id 'gn:name name2))
+ ;; Symbol, if there is one
+ (unless (string-null? symbol)
+ (triple id 'gn:symbol symbol))
+ ;; Alias, if there is one
+ (unless (string-null? alias)
+ (triple id 'gn:alias alias))))))
+ db
+ "SELECT Species.FullName, Strain.Name, Strain.Name2, Strain.Symbol, Strain.Alias FROM Strain JOIN Species ON Strain.SpeciesId = Species.SpeciesId"))
+
+(define (mapping-method-name->id name)
+ (string->symbol (string-append "gn:mappingMethod" name)))
+
+(define (dump-mapping-method db)
+ (sql-for-each (match-lambda
+ (((_ . name))
+ (triple (string-append "gn:mappingMethod" name)
+ 'rdf:type 'gn:mappingMethod)))
+ db
+ "SELECT Name FROM MappingMethod"))
+
+(define (inbred-set-name->id name)
+ (string->symbol (string-append "gn:inbredSet" name)))
+
+(define (dump-inbred-set db)
+ (sql-for-each (lambda (alist)
+ (let ((id (inbred-set-name->id (assoc-ref alist "Name"))))
+ (triple id 'rdf:type 'gn:phenotype)
+ (scm->triples
+ (filter-map (match-lambda
+ (('gn:binomialName . binomial-name)
+ (cons 'gn:inbredSetOfSpecies
+ (binomial-name->species-id binomial-name)))
+ (('gn:mappingMethodName . mapping-method-name)
+ (cons 'gn:inbredSetMappingMethod
+ (mapping-method-name->id mapping-method-name)))
+ (x x))
+ (process-metadata-alist alist))
+ id)))
+ db
+ "SELECT InbredSet.Name, InbredSet.FullName, GeneticType, Family,
+Species.FullName AS BinomialName, MappingMethod.Name AS MappingMethodName
+FROM InbredSet
+INNER JOIN Species USING (SpeciesId)
+INNER JOIN MappingMethod ON InbredSet.MappingMethodId = MappingMethod.Id"))
+
+(define (phenotype-id->id id)
+ (string->symbol (string-append "gn:phenotype" (number->string id))))
+
+(define (dump-phenotype db)
+ (sql-for-each (lambda (alist)
+ (let ((id (phenotype-id->id (assoc-ref alist "Id"))))
+ (triple id 'rdf:type 'gn:phenotype)
+ (scm->triples
+ (filter (match-lambda
+ (('gn:id . _) #f)
+ (('gn:units . value)
+ (string-ci=? value "unknown"))
+ (_ #t))
+ (process-metadata-alist alist))
+ id)))
+ db
+ "SELECT Id, Pre_publication_description, Post_publication_description,
+Original_description, Units, Pre_publication_abbreviation, Post_publication_abbreviation,
+Lab_code, Submitter, Owner, Authorized_Users FROM Phenotype"))
+
+(define (dump-publication db)
+ (sql-for-each (lambda (alist)
+ (let ((id (string-append "gn:publication"
+ (number->string (assoc-ref alist "Id")))))
+ (triple id 'rdf:type 'gn:publication)
+ (triple 'gn:title 'rdfs:subPropertyOf 'rdfs:label)
+ (scm->triples
+ (append-map (match-lambda
+ (('gn:id . _) '())
+ ;; The authors field is a comma
+ ;; separated list. Split it.
+ (('gn:authors . authors)
+ (map (lambda (author-name)
+ (cons 'gn:author (string-trim author-name)))
+ (string-split authors #\,)))
+ (('gn:abstract . abstract)
+ ;; TODO: Handle unprintable
+ ;; characters better.
+ (list (cons 'gn:abstract
+ (string-replace-substring abstract "\x01" ""))))
+ (x (list x)))
+ (process-metadata-alist alist))
+ id)))
+ db
+ "SELECT Id, PubMed_ID, Abstract, Authors, Title, Journal, Volume, Pages, Month, Year FROM Publication"))
+
+(define (dump-publish-xref db)
+ (sql-for-each (match-lambda
+ (((_ . inbred-set-name)
+ (_ . phenotype-id))
+ (triple (phenotype-id->id phenotype-id)
+ 'gn:phenotypeOfSpecies
+ (inbred-set-name->id inbred-set-name))))
+ db
+ "SELECT Name, PhenotypeId
+FROM PublishXRef
+INNER JOIN InbredSet USING (InbredSetId)"))
+
+(define (dump-data-table db table-name data-field)
+ (let ((dump-directory (string-append %dump-directory "/" table-name))
+ (port #f)
+ (current-strain-id #f))
+ (unless (file-exists? dump-directory)
+ (mkdir dump-directory))
+ (sql-for-each (match-lambda
+ (((_ . strain-id)
+ (_ . value))
+ ;; Close file if new strain.
+ (when (and port
+ (not (= current-strain-id strain-id)))
+ (close-port port)
+ (set! port #f))
+ ;; If no file is open, open new file.
+ (unless port
+ (set! current-strain-id strain-id)
+ (let ((filename (string-append dump-directory
+ "/" (number->string strain-id))))
+ (display filename (current-error-port))
+ (newline (current-error-port))
+ (set! port (open-output-file filename))))
+ (display value port)
+ (newline port)))
+ db
+ (format #f "SELECT StrainId, ~a FROM ~a ORDER BY StrainId"
+ data-field table-name))
+ (close-port port)))
+
+(call-with-genenetwork-database
+ (lambda (db)
+ (with-output-to-file (string-append %dump-directory "/dump.ttl")
+ (lambda ()
+ (format #t "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .~%")
+ (format #t "@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .~%")
+ (format #t "@prefix gn: <https://genenetwork.org/> .~%~%")
+ (dump-species db)
+ (dump-strain db)
+ (dump-mapping-method db)
+ (dump-inbred-set db)
+ (dump-phenotype db)
+ (dump-publication db)
+ (dump-publish-xref db)))))