diff options
-rw-r--r-- | README.md | 24 | ||||
-rw-r--r-- | conn.scm | 4 | ||||
-rw-r--r-- | dump/schema-dump.scm | 11 | ||||
-rwxr-xr-x | examples/dump-species-metadata.scm | 10 | ||||
-rw-r--r-- | schema/species.ttl | 8 |
5 files changed, 39 insertions, 18 deletions
@@ -3,9 +3,7 @@ badge](https://ci.genenetwork.org/badge/dump-genenetwork-database-tests.svg)](ht [![dump-genenetwork-database CI badge](https://ci.genenetwork.org/badge/dump-genenetwork-database.svg)](https://ci.genenetwork.org/jobs/dump-genenetwork-database) -The GeneNetwork database is being migrated from a relational database to -a plain text and RDF database. This repository contains code to dump the -relational database to plain text. +This repository contains code to dump the metadata in the GeneNetwork relational database to RDF. It requires a connection to a SQL server. # Using @@ -15,6 +13,12 @@ Drop into a development environment with $ guix shell -m manifest.scm ``` +If the path is not picked up, add + +``` +export PATH=$GUIX_ENVIRONMENT/bin:$PATH +``` + Build the sources. ``` shell @@ -74,22 +78,22 @@ Here's a sample *conn.scm*. ## Dump the database -Then, to dump the database to \~/data/dump, run +Then, to dump the database to \~/data/dump, run inside the shell -``` shell -$ guix shell -m manifest.scm -- ./pre-inst-env ./examples/dump-dataset-metadata.scm conn.scm ~/data/dump-data/ +```sh +./pre-inst-env ./examples/dump-species-metadata.scm ../conn.scm ~/tmp ``` -Make sure there is enough free space! It\'s best to dump the database on -penguin2 where disk space and bandwidth are not significant -constraints. +``` shell +$ guix shell -m manifest.scm -- ./pre-inst-env ./examples/dump-dataset-metadata.scm ../conn.scm ~/tmp +``` ## Validate and load dump Then, validate the dumped RDF using `rapper` and load it into virtuoso. This will load the dumped RDF into the `http://genenetwork.org` graph, and will delete all pre-existing data -in that graph. +in that graph (FIXME) ``` shell $ guix shell -m manifest.scm -- rapper --input turtle --count ~/data/dump/dump.ttl @@ -1,11 +1,11 @@ ((sql-username . "webqtlout") - (sql-password . "webqtlout") + (sql-password . "*") (sql-database . "db_webqtl") (sql-host . "localhost") (sql-port . 3306) (virtuoso-port . 
8891) (virtuoso-username . "dba") - (virtuoso-password . "dba") + (virtuoso-password . "*") (sparql-scheme . http) (sparql-host . "localhost") (sparql-port . 8892) diff --git a/dump/schema-dump.scm b/dump/schema-dump.scm index 876eafb..86626f4 100644 --- a/dump/schema-dump.scm +++ b/dump/schema-dump.scm @@ -1,6 +1,10 @@ (define-module (dump schema) #:use-module (ice-9 match) - #:use-module (dump sql)) + #:use-module (srfi srfi-26) + #:use-module (dump sql) + #:use-module (dump triples) + #:use-module (dump strings) + #:use-module (dump table)) (define (dump-table-fields db table) @@ -90,8 +94,9 @@ is a <table> object." (table-columns table)))) tables))) -(define (dump-data-table db table-name data-field) - (let ((dump-directory (string-append %dump-directory "/" table-name)) +(define* (dump-data-table db table-name data-field + #:optional (default-dump-directory "")) + (let ((dump-directory (string-append default-dump-directory "/" table-name)) (port #f) (current-strain-id #f)) (unless (file-exists? dump-directory) diff --git a/examples/dump-species-metadata.scm b/examples/dump-species-metadata.scm index 39f7147..6ac2640 100755 --- a/examples/dump-species-metadata.scm +++ b/examples/dump-species-metadata.scm @@ -18,6 +18,16 @@ +(define (remap-species-identifiers str) + "This procedure remaps identifiers to standard binominal. Obviously this should + be sorted by correcting the database!" + (match str + ["Fly (Drosophila melanogaster dm6)" "Drosophila melanogaster"] + ["Oryzias latipes (Japanese medaka)" "Oryzias latipes"] + ["Monkey (Macaca nemestrina)" "Macaca nemestrina"] + ["Bat (Glossophaga soricina)" "Glossophaga soricina"] + [str str])) + (define-dump dump-species (tables (Species)) (schema-triples diff --git a/schema/species.ttl b/schema/species.ttl index 6b3acaf..cc5b989 100644 --- a/schema/species.ttl +++ b/schema/species.ttl @@ -6,14 +6,16 @@ @prefix gn: <http://genenetwork.org/id/> . gn:Arabidopsis_thaliana rdf:isDefinedBy wd:Q158695 . 
+# Bat gn:Glossophaga_soricina rdf:isDefinedBy wd:Q304929 . -gn:Drosophila_melanogaster_dm6 rdf:isDefinedBy wd:Q130888 . +gn:Drosophila_melanogaster rdf:isDefinedBy wd:Q130888 . gn:Glycine_max rdf:isDefinedBy wd:Q11006 . gn:Homo_sapiens rdf:isDefinedBy wd:Q15978631 . gn:Hordeum_vulgare rdf:isDefinedBy wd:Q11577 . -gn:Macaca_mulatta rdf:isDefinedBy wd:Q177601 . +gn:Macaca_mulatta rdf:isDefinedBy wd:Q156606 . gn:Mus_musculus rdf:isDefinedBy wd:Q83310 . -gn:Japanese_medaka rdf:isDefinedBy wd:Q1142975 . +# Japanese Medaka +gn:Oryzias_latipes rdf:isDefinedBy wd:Q1142975 . gn:Populus_trichocarpa rdf:isDefinedBy wd:Q149382 . gn:Rattus_norvegicus rdf:isDefinedBy wd:Q184224 . gn:Solanum_lycopersicum rdf:isDefinedBy wd:Q23501 . |