diff options
Diffstat (limited to 'doc')
-rw-r--r-- | doc/API_readme.md | 169 | ||||
-rw-r--r-- | doc/GUIX-Reproducible-from-source.org | 356 | ||||
-rw-r--r-- | doc/README.org | 462 | ||||
-rw-r--r-- | doc/database.org | 165 | ||||
-rw-r--r-- | doc/development.org | 43 | ||||
-rw-r--r-- | doc/elasticsearch.org | 247 |
6 files changed, 1113 insertions, 329 deletions
diff --git a/doc/API_readme.md b/doc/API_readme.md new file mode 100644 index 00000000..652376a0 --- /dev/null +++ b/doc/API_readme.md @@ -0,0 +1,169 @@ +# API Query Documentation # +--- +# Fetching Dataset/Trait info/data # +--- +## Fetch Species List ## + +To get a list of species with data available in GN (and their associated names and ids): +``` +curl http://gn2.genenetwork.org/api/v_pre1/species +[ { "FullName": "Mus musculus", "Id": 1, "Name": "mouse", "TaxonomyId": 10090 }, ... { "FullName": "Populus trichocarpa", "Id": 10, "Name": "poplar", "TaxonomyId": 3689 } ] +``` + +Or to get a single species info: +``` +curl http://gn2.genenetwork.org/api/v_pre1/species/mouse +``` +OR +``` +curl http://gn2.genenetwork.org/api/v_pre1/species/mouse.json +``` + +*For all queries where the last field is a user-specified name/ID, there will be the option to append a file format type. Currently there is only JSON (and it will default to JSON if none is provided), but other formats will be added later* + +## Fetch Groups/RISets ## + +This query can optionally filter by species: + +``` +curl http://gn2.genenetwork.org/api/v_pre1/groups (for all species) +``` +OR +``` +curl http://gn2.genenetwork.org/api/v_pre1/groups/mouse (for just mouse groups/RISets) +[ { "DisplayName": "BXD", "FullName": "BXD RI Family", "GeneticType": "riset", "Id": 1, "MappingMethodId": "1", "Name": "BXD", "SpeciesId": 1, "public": 2 }, ... { "DisplayName": "AIL LGSM F34 and F39-43 (GBS)", "FullName": "AIL LGSM F34 and F39-43 (GBS)", "GeneticType": "intercross", "Id": 72, "MappingMethodId": "2", "Name": "AIL-LGSM-F34-F39-43-GBS", "SpeciesId": 1, "public": 2 } ] +``` + +## Fetch Genotypes for Group/RISet ## +``` +curl http://gn2.genenetwork.org/api/v_pre1/genotypes/bimbam/BXD +curl http://gn2.genenetwork.org/api/v_pre1/genotypes/BXD.bimbam +``` +Returns a group's genotypes in one of several formats - bimbam, rqtl2, or geno (a format used by qtlreaper which is just a CSV file consisting of marker positions and genotypes) + +Rqtl2 genotype queries can also include the dataset name and will return a zip of the genotypes, phenotypes, and gene map (marker names/positions). For example: +``` +curl http://gn2.genenetwork.org/api/v_pre1/genotypes/rqtl2/BXD/HC_M2_0606_P.zip +``` + +## Fetch Datasets ## +``` +curl http://gn2.genenetwork.org/api/v_pre1/datasets/bxd +``` +OR +``` +curl http://gn2.genenetwork.org/api/v_pre1/datasets/mouse/bxd +[ { "AvgID": 1, "CreateTime": "Fri, 01 Aug 2003 00:00:00 GMT", "DataScale": "log2", "FullName": "UTHSC/ETHZ/EPFL BXD Liver Polar Metabolites Extraction A, CD Cohorts (Mar 2017) log2", "Id": 1, "Long_Abbreviation": "BXDMicroArray_ProbeSet_August03", "ProbeFreezeId": 3, "ShortName": "Brain U74Av2 08/03 MAS5", "Short_Abbreviation": "Br_U_0803_M", "confidentiality": 0, "public": 0 }, ... { "AvgID": 3, "CreateTime": "Tue, 14 Aug 2018 00:00:00 GMT", "DataScale": "log2", "FullName": "EPFL/LISP BXD CD Liver Affy Mouse Gene 1.0 ST (Aug18) RMA", "Id": 859, "Long_Abbreviation": "EPFLMouseLiverCDRMAApr18", "ProbeFreezeId": 181, "ShortName": "EPFL/LISP BXD CD Liver Affy Mouse Gene 1.0 ST (Aug18) RMA", "Short_Abbreviation": "EPFLMouseLiverCDRMA0818", "confidentiality": 0, "public": 1 } ] +``` +(I added the option to specify species just in case we end up with the same group name across multiple species at some point, though it's currently unnecessary) + +## Fetch Individual Dataset Info ## +### For mRNA Assay/"ProbeSet" ### + +``` +curl http://gn2.genenetwork.org/api/v_pre1/dataset/HC_M2_0606_P +``` +OR +``` +curl http://gn2.genenetwork.org/api/v_pre1/dataset/bxd/HC_M2_0606_P``` +{ "confidential": 0, "data_scale": "log2", "dataset_type": "mRNA expression", "full_name": "Hippocampus Consortium M430v2 (Jun06) PDNN", "id": 112, "name": "HC_M2_0606_P", "public": 2, "short_name": "Hippocampus M430v2 BXD 06/06 PDNN", "tissue": "Hippocampus mRNA", "tissue_id": 9 } +``` +(This also has the option to specify group/riset) + +### For "Phenotypes" (basically non-mRNA Expression; stuff like weight, sex, etc) ### +For these traits, the query fetches publication info and takes the group and phenotype 'ID' as input. For example: +``` +curl http://gn2.genenetwork.org/api/v_pre1/dataset/bxd/10001 +{ "dataset_type": "phenotype", "description": "Central nervous system, morphology: Cerebellum weight, whole, bilateral in adults of both sexes [mg]", "id": 10001, "name": "CBLWT2", "pubmed_id": 11438585, "title": "Genetic control of the mouse cerebellum: identification of quantitative trait loci modulating size and architecture", "year": "2001" } +``` + +## Fetch Sample Data for Dataset ## +``` +curl http://gn2.genenetwork.org/api/v_pre1/sample_data/HSNIH-PalmerPublish.csv +``` + +Returns a CSV file with sample/strain names as the columns and trait IDs as rows + +## Fetch Sample Data for Single Trait ## +``` +curl http://gn2.genenetwork.org/api/v_pre1/sample_data/HC_M2_0606_P/1436869_at +[ { "data_id": 23415463, "sample_name": "129S1/SvImJ", "sample_name_2": "129S1/SvImJ", "se": 0.123, "value": 8.201 }, { "data_id": 23415463, "sample_name": "A/J", "sample_name_2": "A/J", "se": 0.046, "value": 8.413 }, { "data_id": 23415463, "sample_name": "AKR/J", "sample_name_2": "AKR/J", "se": 0.134, "value": 8.856 }, ... ] +``` + +## Fetch Trait List for Dataset ## +``` +curl http://gn2.genenetwork.org/api/v_pre1/traits/HXBBXHPublish.json +[ { "Additive": 0.0499967532467532, "Id": 10001, "LRS": 16.2831307029479, "Locus": "rs106114574", "PhenotypeId": 1449, "PublicationId": 319, "Sequence": 1 }, ... ] +``` + +Both JSON and CSV formats can be specified, with JSON as default. There is also an optional "ids_only" and "names_only" parameter that will only return a list of trait IDs or names, respectively. + +## Fetch Trait Info (Name, Description, Location, etc) ## +### For mRNA Expression/"ProbeSet" ### +``` +curl http://gn2.genenetwork.org/api/v_pre1/trait/HC_M2_0606_P/1436869_at +{ "additive": -0.214087568058076, "alias": "HHG1; HLP3; HPE3; SMMCI; Dsh; Hhg1", "chr": "5", "description": "sonic hedgehog (hedgehog)", "id": 99602, "locus": "rs8253327", "lrs": 12.7711275309832, "mb": 28.457155, "mean": 9.27909090909091, "name": "1436869_at", "p_value": 0.306, "se": null, "symbol": "Shh" } +``` + +### For "Phenotypes" ### +For phenotypes this just gets the max LRS, its location, and additive effect (as calculated by qtlreaper) + +Since each group/riset only has one phenotype "dataset", this query takes either the group/riset name or the group/riset name + "Publish" (for example "BXDPublish", which is the dataset name in the DB) as input +``` +curl http://gn2.genenetwork.org/api/v_pre1/trait/BXD/10001 +{ "additive": 2.39444435069444, "id": 4, "locus": "rs48756159", "lrs": 13.4974911471087 } +``` + +--- + +# Analyses # +--- +## Mapping ## +Currently two mapping tools can be used - GEMMA and R/qtl. qtlreaper will be added later with Christian Fischer's RUST implementation - https://github.com/chfi/rust-qtlreaper + +Each method's query takes the following parameters respectively (more will be added): +### GEMMA ### +* trait_id (*required*) - ID for trait being mapped +* db (*required*) - DB name for trait above (Short_Abbreviation listed when you query for datasets) +* use_loco - Whether to use LOCO (leave one chromosome out) method (default = false) +* maf - minor allele frequency (default = 0.01) + +Example query: +``` +curl http://gn2.genenetwork.org/api/v_pre1/mapping?trait_id=10015&db=BXDPublish&method=gemma&use_loco=true +``` + +### R/qtl ### +(See the R/qtl guide for information on some of these options - http://www.rqtl.org/manual/qtl-manual.pdf) +* trait_id (*required*) - ID for trait being mapped +* db (*required*) - DB name for trait above (Short_Abbreviation listed when you query for datasets) +* rqtl_method - hk (default) | ehk | em | imp | mr | mr-imp | mr-argmax ; Corresponds to the "method" option for the R/qtl scanone function. +* rqtl_model - normal (default) | binary | 2-part | np ; corresponds to the "model" option for the R/qtl scanone function +* num_perm - number of permutations; 0 by default +* control_marker - Name of marker to use as control; this relies on the user knowing the name of the marker they want to use as a covariate +* interval_mapping - Whether to use interval mapping; "false" by default +* pair_scan - *NYI* + +Example query: +``` +curl http://gn2.genenetwork.org/api/v_pre1/mapping?trait_id=1418701_at&db=HC_M2_0606_P&method=rqtl&num_perm=100 +``` + +Some combinations of methods/models may not make sense. The R/qtl manual should be referred to for any questions on its use (specifically the scanone function in this case) + +## Calculate Correlation ## +Currently only Sample and Tissue correlations are implemented + +This query currently takes the following parameters (though more will be added): +* trait_id (*required*) - ID for trait used for correlation +* db (*required*) - DB name for the trait above (this is the Short_Abbreviation listed when you query for datasets) +* target_db (*required*) - Target DB name to be correlated against +* type - sample (default) | tissue +* method - pearson (default) | spearman +* return - Number of results to return (default = 500) + +Example query: +``` +curl http://gn2.genenetwork.org/api/v_pre1/correlation?trait_id=1427571_at&db=HC_M2_0606_P&target_db=BXDPublish&type=sample&return_count=100 +[ { "#_strains": 6, "p_value": 0.004804664723032055, "sample_r": -0.942857142857143, "trait": 20511 }, { "#_strains": 6, "p_value": 0.004804664723032055, "sample_r": -0.942857142857143, "trait": 20724 }, { "#_strains": 12, "p_value": 1.8288943424888848e-05, "sample_r": -0.9233615170820528, "trait": 13536 }, { "#_strains": 7, "p_value": 0.006807187408935392, "sample_r": 0.8928571428571429, "trait": 10157 }, { "#_strains": 7, "p_value": 0.006807187408935392, "sample_r": -0.8928571428571429, "trait": 20392 }, ... ] +``` diff --git a/doc/GUIX-Reproducible-from-source.org b/doc/GUIX-Reproducible-from-source.org index 4399ea26..83adce99 100644 --- a/doc/GUIX-Reproducible-from-source.org +++ b/doc/GUIX-Reproducible-from-source.org @@ -2,19 +2,188 @@ * Table of Contents :TOC: - [[#introduction][Introduction]] - - [[#binary-deployment][Binary deployment]] + - [[#binary-deployment-through-gnu-guix][Binary deployment through GNU Guix]] + - [[#quick-installation-recipe][Quick installation recipe]] + - [[#step-1-install-gnu-guix][Step 1: Install GNU Guix]] + - [[#step-2-checkout-the-gn2-git-repositories][Step 2: Checkout the GN2 git repositories]] + - [[#step-3-authorize-the-gn-guix-server][Step 3: Authorize the GN Guix server]] + - [[#step-4-install-and-run-gn2][Step 4: Install and run GN2]] - [[#from-source-deployment][From source deployment]] - [[#create-archive][Create archive]] + - [[#source-deployment][Source deployment]] + - [[#run-your-own-copy-of-gn2][Run your own copy of GN2]] + - [[#set-up-nginx-port-forwarding][Set up nginx port forwarding]] + - [[#source-deployment-and-other-information-on-reproducibility][Source deployment and other information on reproducibility]] + - [[#update-to-recent-guix][Update to recent guix]] + - [[#install-gn2][Install GN2]] + - [[#run-gn2][Run GN2]] * Introduction Large system deployments tend to get very complex. In this document we explain the GeneNetwork deployment system which is based on GNU Guix -(see Pjotr's [[https://github.com/pjotrp/guix-notes/blob/master/README.md][Guix-notes]]). +(see Pjotr's [[https://github.com/pjotrp/guix-notes/blob/master/README.md][Guix-notes]] and the main [[README.org]] doc). -* Binary deployment +* Binary deployment through GNU Guix +** Quick installation recipe -NYA (will go to README) +This is a recipe for quick and dirty installation of GN2. For +convenience everything is installed as root, though in reality only +GNU Guix has to be installed as root. I tested this recipe on a fresh +install of Debian 8.3.0 (in KVM) though it should work on any modern +Linux distribution (including CentOS). + + +Note that GN2 consists of an approx. 5 GB installation including +database. If you use a virtual machine we recommend to use at least +double. + +** Step 1: Install GNU Guix + +Fetch the GNU Guix binary from [[https://www.gnu.org/software/guix/download/][here]] (middle panel) and follow +[[https://www.gnu.org/software/guix/manual/html_node/Binary-Installation.html][instructions]]. Essentially, download and unpack the tar ball (which +creates directories in /gnu and /var/guix), add build users and group +(Guix builds software as unpriviliged users) and run the Guix daemon +after fixing the paths (also known as the 'profile'). + +Once you have succeeded, you have to [[https://github.com/pjotrp/guix-notes/blob/master/INSTALL.org#set-the-key][set the key]] (getting permission +to download binaries from the GNU server) and you should be able to +install the hello package using binary packages (no building) + +#+begin_src bash +export PATH=~/.guix-profile/bin:$PATH +guix pull +guix package -i hello --dry-run +#+end_src + +Which should show something like + +: The following files would be downloaded: +: /gnu/store/zby49aqfbd9w9br4l52mvb3y6f9vfv22-hello-2.10 +: ... +#+end_src + +means binary installs. The actual installation command of 'hello' is + +#+begin_src bash +guix package -i hello +hello + Hello, world! +#+end_src + +If you actually see things building it means that Guix is not yet +properly installed and up-to-date, i.e., the key is missing or you +need to do a 'guix pull'. Press Ctrl-C to interrupt. + +If you need more help we have another writeup in [[https://github.com/pjotrp/guix-notes/blob/master/INSTALL.org#binary-installation][guix-notes]]. To get +rid of the locale warning see [[https://github.com/pjotrp/guix-notes/blob/master/INSTALL.org#set-locale][set-locale]]. + +** Step 2: Checkout the GN2 git repositories + +To fixate the software dependency graph GN2 uses git repositories of +Guix packages. First install git if it is missing + +#+begin_src bash +guix package -i git +export GIT_SSL_CAINFO=/etc/ssl/certs/ca-certificates.crt +#+end_src + +check out the git repositories (gn-deploy branch) + +#+begin_src bash +cd ~ +mkdir genenetwork +cd genenetwork +git clone --branch gn-deploy https://github.com/genenetwork/guix-bioinformatics +git clone --branch gn-deploy --recursive https://github.com/genenetwork/guix guix-gn-deploy +cd guix-gn-deploy +#+end_src bash + +To test whether this is working try: + +#+begin_src bash +#+end_src bash + +** Step 3: Authorize the GN Guix server + +GN2 has its own GNU Guix binary distribution server. To trust it you have +to add the following key + +#+begin_src scheme +(public-key + (ecc + (curve Ed25519) + (q #11217788B41ADC8D5B8E71BD87EF699C65312EC387752899FE9C888856F5C769#) + ) +) +#+end_src + +by pasting it into the command + +#+begin_src bash +guix archive --authorize +#+end_src + +and hit Ctrl-D. + +Now you can use the substitute server to install GN2 binaries. + +** Step 4: Install and run GN2 + +Since this is a quick and dirty install we are going to override the +GNU Guix package path by pointing the package path to our repository: + +#+begin_src bash +rm /root/.config/guix/latest +ln -s ~/genenetwork/guix-gn-deploy/ /root/.config/guix/latest +#+end_src + +Now check whether you can find the GN2 package with + +#+begin_src bash +env GUIX_PACKAGE_PATH=~/genenetwork/guix-bioinformatics/ guix package -A genenetwork2 + genenetwork2 2.0-a8fcff4 out gn/packages/genenetwork.scm:144:2 +#+end_src + +(ignore the source file newer then ... messages, this is caused by the +/root/.config/guix/latest override). + +And install with + +#+begin_src bash +env GUIX_PACKAGE_PATH=~/genenetwork/guix-bioinformatics/ \ + guix package -i genenetwork2 \ + --substitute-urls="http://guix.genenetwork.org" +#+end_src + +Note: the order of the substitute url's may make a difference in speed +(put the one first that is fastest for your location and time of day). + +Note: if your system starts building or gives an error it may well be +Step 3 did not succeed. The installation should actually be smooth at +this point and only do binary installs (no compiling). + +After installation you should be able to run genenetwork2 after updating +the Guix suggested environment vars. Check the output of + +#+begin_src bash +guix package --search-paths +export PYTHONPATH="/root/.guix-profile/lib/python2.7/site-packages" +export R_LIBS_SITE="/root/.guix-profile/site-library/" +#+end_src + +and copy-paste the listed exports into the terminal before running: + +#+begin_src bash +genenetwork2 +#+end_src + +It will complain that the database is missing. See the next section on +running MySQL server for downloading and installing a MySQL GN2 +database. After installing the database restart genenetwork2 and point +your browser at [[http://localhost:5003/]]. + +End of the GN2 installation recipe! * From source deployment @@ -52,3 +221,182 @@ gn-stable-guix$ env GUIX_PACKAGE_PATH=../guix-bioinformatics ./pre-inst-env guix * Create archive : env GUIX_PACKAGE_PATH=../../genenetwork/guix-bioinformatics/ ./pre-inst-env guix archive --export -r genenetwork2 > guix_gn2-2.0-9e9475053.nar + + +* Source deployment + +This section gives a more elaborate instruction for installing GN2 +from source. + +First execute above 4 steps: + + - [[#step-1-install-gnu-guix][Step 1: Install GNU Guix]] + - [[#step-2-checkout-the-gn2-git-repositories][Step 2: Checkout the GN2 git repositories]] + - [[#step-3-authorize-the-gn-guix-server][Step 3: Authorize the GN Guix server]] + - [[#step-4-install-and-run-gn2-][Step 4: Install and run GN2 ]] + + +** Run your own copy of GN2 + +At some point you may want to fix the source code. Assuming you have +Guix and Genenetwork2 installed (as described above) clone the GN2 +repository from https://github.com/genenetwork/genenetwork2. + +Copy-paste the paths into your terminal (mainly so PYTHON_PATH and +R_LIBS_SITE are set) from the information given by guix: + +: guix package --search-paths + +Inside the repository: + +: cd genenetwork2 +: ./bin/genenetwork2 + +Will fire up your local repo http://localhost:5003/ using the +settings in ./etc/default_settings.py. These settings may +not reflect your system. To override settings create your own from a copy of +default_settings.py and pass it into GN2 with + +: ./bin/genenetwork2 $HOME/my_settings.py + +and everything *should* work (note the full path to the settings +file). This way we develop against the exact same dependency graph of +software. + +If something is not working, take a hint from the settings file +that comes in the Guix installation. It sits in something like + +: cat ~/.guix-profile/lib/python2.7/site-packages/genenetwork2-2.0-py2.7.egg/etc/default_settings.py + +** Set up nginx port forwarding + +nginx can be used as a reverse proxy for GN2. For example, we want to +expose GN2 on port 80 while it is running on port 5003. Essentially +the configuration looks like + +#+begin_src js + server { + listen 80; + server_name test-gn2.genenetwork.org; + access_log logs/test-gn2.access.log; + + proxy_connect_timeout 3000; + proxy_send_timeout 3000; + proxy_read_timeout 3000; + send_timeout 3000; + + location / { + proxy_set_header Host $http_host; + proxy_set_header Connection keep-alive; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Host $server_name; + proxy_pass http://127.0.0.1:5003; + } +} +#+end_src js + +Install the nginx webserver (as root) + +: guix package -i nginx + +The nginx example configuration examples can be found in the Guix +store through + +: ls -l /root/.guix-profile/sbin/nginx +: lrwxrwxrwx 3 root guixbuild 66 Dec 31 1969 /root/.guix-profile/sbin/nginx -> /gnu/store/g0wrcl5z27rmk5b52rldzvk1bzzbnz2l-nginx-1.8.1/sbin/nginx + +Use that path + +: ls /gnu/store/g0wrcl5z27rmk5b52rldzvk1bzzbnz2l-nginx-1.8.1/share/nginx/conf/ +: fastcgi.conf koi-win scgi_params +: fastcgi.conf.default mime.types scgi_params.default +: fastcgi_params mime.types.default uwsgi_params +: fastcgi_params.default nginx.conf uwsgi_params.default +: koi-utf nginx.conf.default win-utf + +And copy any relevant files to /etc/nginx. A configuration file for +GeneNetwork (reverse proxy) port forwarding can be found in the source +repository under ./etc/nginx-genenetwork.conf. Copy this file to /etc +(still as root) +: cp ./etc/nginx-genenetwork.conf /etc/nginx/ + +Make dirs + +: mkdir -p /var/spool/nginx/logs + +Add users + +: adduser nobody ; addgroup nobody + +Run nginx + +: /root/.guix-profile/sbin/nginx -c /etc/nginx/nginx-genenetwork.conf -p /var/spool/nginx + +* Source deployment and other information on reproducibility + +See the document [[GUIX-Reproducible-from-source.org]]. + +** Update to recent guix + +We now compile Guix from scratch. + +Create, install and run a recent version of the guix-daemon by +compiling the guix repository you have installed with git in +step 2. Follow [[https://github.com/pjotrp/guix-notes/blob/master/INSTALL.org#building-gnu-guix-from-source-using-guix][these]] steps carefully after + +: cd ~/genenetwork/guix-gn-deploy + +Make sure to restart the guix daemon and run guix client from this +directory. + +** Install GN2 + +Reinstall genenetwork2 using the new tree + +#+begin_src bash +env GUIX_PACKAGE_PATH=~/genenetwork/guix-bioinformatics/ ./pre-inst-env guix package -i genenetwork2 --substitute-urls="http://guix.genenetwork.org https://mirror.guixsd.org" +#+end_src bash + +Note the use of ./pre-inst-env here! + +Actually, it should be the same installation as in step 4, so nothing +gets downloaded. + +** Run GN2 + +Make a note of the paths with + +#+begin_src bash +./pre-inst-env guix package --search-paths +#+end_src bash + +or this should also work if guix is installed + +#+begin_src bash +guix package --search-paths +#+end_src bash + +After setting the paths for the server + +#+begin_src bash +export PATH=~/.guix-profile/bin:$PATH +export PYTHONPATH="$HOME/.guix-profile/lib/python2.7/site-packages" +export R_LIBS_SITE="$HOME/.guix-profile/site-library/" +export GUIX_GTK3_PATH="$HOME/.guix-profile/lib/gtk-3.0" +export GI_TYPELIB_PATH="$HOME/.guix-profile/lib/girepository-1.0" +export XDG_DATA_DIRS="$HOME/.guix-profile/share" +export GIO_EXTRA_MODULES="$HOME/.guix-profile/lib/gio/modules" +#+end_src bash + +run the main script (in ~/.guix-profile/bin) + +#+begin_src bash +genenetwork2 +#+end_src bash + +will start the default server which listens on port 5003, i.e., +http://localhost:5003/. + +OK, we are where we were before with step 4. Only difference is that we +used our own compiled guix server. diff --git a/doc/README.org b/doc/README.org index b38ea664..620c946c 100644 --- a/doc/README.org +++ b/doc/README.org @@ -2,33 +2,29 @@ * Table of Contents :TOC: - [[#introduction][Introduction]] - - [[#quick-installation-recipe][Quick installation recipe]] - - [[#step-1-install-gnu-guix][Step 1: Install GNU Guix]] - - [[#step-2-checkout-the-gn2-git-repositories][Step 2: Checkout the GN2 git repositories]] - - [[#step-3-authorize-the-gn-guix-server][Step 3: Authorize the GN Guix server]] - - [[#step-4-install-and-run-gn2][Step 4: Install and run GN2]] + - [[#install][Install]] + - [[#tarball][Tarball]] + - [[#docker][Docker]] + - [[#with-source][With source]] + - [[#running-gn2][Running GN2]] - [[#run-mysql-server][Run MySQL server]] + - [[#install-mysql-with-gnu-guix][Install MySQL with GNU GUIx]] + - [[#load-the-small-database-in-mysql][Load the small database in MySQL]] - [[#gn2-dependency-graph][GN2 Dependency Graph]] - - [[#source-deployment][Source deployment]] - - [[#run-your-own-copy-of-gn2][Run your own copy of GN2]] - - [[#set-up-nginx-port-forwarding][Set up nginx port forwarding]] - - [[#source-deployment-and-other-information-on-reproducibility][Source deployment and other information on reproducibility]] - - [[#update-to-recent-guix][Update to recent guix]] - - [[#install-gn2][Install GN2]] - - [[#run-gn2][Run GN2]] + - [[#working-with-the-gn2-source-code][Working with the GN2 source code]] + - [[#running-elasticsearch][Running ElasticSearch]] + - [[#systemd][SystemD]] + - [[#read-more][Read more]] - [[#trouble-shooting][Trouble shooting]] - [[#importerror-no-module-named-jinja2][ImportError: No module named jinja2]] - - [[#error-can-not-find-directory-homegn2_data][ERROR: can not find directory $HOME/gn2_data]] + - [[#error-can-not-find-directory-homegn2_data-or-can-not-find-directory-homegenotype_filesgenotype][ERROR: 'can not find directory $HOME/gn2_data' or 'can not find directory $HOME/genotype_files/genotype']] - [[#cant-run-a-module][Can't run a module]] - [[#rpy2-error-show-now-found][Rpy2 error 'show' now found]] + - [[#mysql-cant-connect-server-through-socket-error][Mysql can't connect server through socket ERROR]] - [[#irc-session][IRC session]] * Introduction -If you want to understand the architecture of GN2 read -[[Architecture.org]]. The rest of this document is mostly on deployment -of GN2. - Large system deployments can get very [[http://biogems.info/contrib/genenetwork/gn2.svg ][complex]]. In this document we explain the GeneNetwork version 2 (GN2) reproducible deployment system which is based on GNU Guix (see also [[https://github.com/pjotrp/guix-notes/blob/master/README.md][Guix-notes]]). The Guix @@ -37,195 +33,133 @@ system can be used to install GN with all its files and dependencies. The official installation path is from a checked out version of the main Guix package tree and that of the Genenetwork package tree. Current supported versions can be found as the SHA values of -'gn-latest' branches of [[https://github.com/genenetwork/guix-bioinformatics/tree/gn-latest][Guix bioinformatics]] and [[https://github.com/genenetwork/guix/tree/gn-latest][GNU Guix main]]. +'gn-latest' branches of [[https://gitlab.com/genenetwork/guix-bioinformatics][Guix bioinformatics]] and [[https://gitlab.com/genenetwork/guix][GNU Guix]]. For a full view of runtime dependencies as defined by GNU Guix, see -the [[#gn2-dependency-graph][GN2 Dependency Graph]]. - -* Quick installation recipe - -This is a recipe for quick and dirty installation of GN2. For -convenience everything is installed as root, though in reality only -GNU Guix has to be installed as root. I tested this recipe on a fresh -install of Debian 8.3.0 (in KVM) though it should work on any modern -Linux distribution (including CentOS). For more elaborate installation -instructions see [[#source-deployment][Source deployment]]. +an example of the [[#gn2-dependency-graph][GN2 Dependency Graph]]. -Note that GN2 consists of an approx. 5 GB installation including -database. If you use a virtual machine we recommend to use at least -double. +* Install -** Step 1: Install GNU Guix +The quickest way to install GN2 is by using a binary installation +(tarball or Docker image). These installations are bundled by GNU +Guix and include all dependencies. You can install GeneNetwork on most +Linux distributions, including Debian, Ubuntu, Fedora and CentOS, +provided you have administrator privileges (root). The alternative is +a Docker installation. -Fetch the GNU Guix binary from [[https://www.gnu.org/software/guix/download/][here]] (middle panel) and follow -[[https://www.gnu.org/software/guix/manual/html_node/Binary-Installation.html][instructions]]. Essentially, download and unpack the tar ball (which -creates directories in /gnu and /var/guix), add build users and group -(Guix builds software as unpriviliged users) and run the Guix daemon -after fixing the paths (also known as the 'profile'). +** Tarball -Once you have succeeded, you have to [[https://github.com/pjotrp/guix-notes/blob/master/INSTALL.org#set-the-key][set the key]] (getting permission -to download binaries from the GNU server) and you should be able to -install the hello package using binary packages (no building) +Download the ~800Mb tarball from +[[http://files.genenetwork.org/software/binary_tarball/]]. Validate the checksum and +unpack to root, for example -#+begin_src bash -export PATH=~/.guix-profile/bin:$PATH -guix pull -guix package -i hello --dry-run -#+end_src - -Which should show something like +: tar xvzf genenetwork2-2.10rc3-1538ffd-tarball-pack.tar.gz +: mv /gnu / +: mv /opt/genenetwork2 /opt/ -: The following files would be downloaded: -: /gnu/store/zby49aqfbd9w9br4l52mvb3y6f9vfv22-hello-2.10 -: ... -#+end_src +Now you shoud be able to start the server with -means binary installs. The actual installation command of 'hello' is +: /opt/genenetwork2/bin/genenetwork2 -#+begin_src bash -guix package -i hello -hello - Hello, world! -#+end_src +When the server stops with a MySQL error [[#run-mysql-server][Run MySQL server]] +and set SQL_URI to point at it. For example: -If you actually see things building it means that Guix is not yet -properly installed and up-to-date, i.e., the key is missing or you -need to do a 'guix pull'. Press Ctrl-C to interrupt. +: export SQL_URI=mysql://gn2:mysql_password@127.0.0.1/db_webqtl_s -If you need more help we have another writeup in [[https://github.com/pjotrp/guix-notes/blob/master/INSTALL.org#binary-installation][guix-notes]]. To get -rid of the locale warning see [[https://github.com/pjotrp/guix-notes/blob/master/INSTALL.org#set-locale][set-locale]]. +See also [[#mysql-cant-connect-server-through-socket-error][Mysql can't connect server through socket ERROR]]. -** Step 2: Checkout the GN2 git repositories +** Docker -To fixate the software dependency graph GN2 uses git repositories of -Guix packages. First install git if it is missing +Docker images are also available through +[[http://files.genenetwork.org/software/]]. Validate the checksum and run +with [[https://docs.docker.com/engine/reference/commandline/load/][Docker load]]. -#+begin_src bash -guix package -i git -export GIT_SSL_CAINFO=/etc/ssl/certs/ca-certificates.crt -#+end_src +** With source -check out the git repositories (gn-deploy branch) +For more elaborate installation instructions on deploying GeneNetwork from +source see [[#source-deployment][Source deployment]]. -#+begin_src bash -cd ~ -mkdir genenetwork -cd genenetwork -git clone --branch gn-deploy https://github.com/genenetwork/guix-bioinformatics -git clone --branch gn-deploy --recursive https://github.com/genenetwork/guix guix-gn-deploy -cd guix-gn-deploy -#+end_src bash +* Running GN2 -To test whether this is working try: +Default settings for GN2 are listed in a file called +[[../etc/default_settings.py][default_settings.py]]. You can copy this file and pass it as a new +parameter to the genenetwork2 command, e.g. -#+begin_src bash -#+end_src bash +: genenetwork2 mysettings.py +or you can set environment variables to override individual parameters, e.g. -** Step 3: Authorize the GN Guix server +: env SERVER_PORT=5004 SQL_URI=mysql://user:pwd@dbhostname/db_webqtl genenetwork2 -GN2 has its own GNU Guix binary distribution server. To trust it you have -to add the following key +the debug and logging switches can be particularly useful when +developing GN2. -#+begin_src scheme -(public-key - (ecc - (curve Ed25519) - (q #11217788B41ADC8D5B8E71BD87EF699C65312EC387752899FE9C888856F5C769#) - ) -) -#+end_src - -by pasting it into the command - -#+begin_src bash -guix archive --authorize -#+end_src - -and hit Ctrl-D. - -Now you can use the substitute server to install GN2 binaries. - -** Step 4: Install and run GN2 - -Since this is a quick and dirty install we are going to override the -GNU Guix package path by pointing the package path to our repository: - -#+begin_src bash -rm /root/.config/guix/latest -ln -s ~/genenetwork/guix-gn-deploy/ /root/.config/guix/latest -#+end_src +* Run MySQL server +** Install MySQL with GNU GUIx -Now check whether you can find the GN2 package with +These are the steps you can take to install a fresh installation of +mysql (which comes as part of the GNU Guix genenetwork2 install). -#+begin_src bash -env GUIX_PACKAGE_PATH=~/genenetwork/guix-bioinformatics/ guix package -A genenetwork2 - genenetwork2 2.0-a8fcff4 out gn/packages/genenetwork.scm:144:2 -#+end_src +As root configure and run -(ignore the source file newer then ... messages, this is caused by the -/root/.config/guix/latest override). +#+BEGIN_SRC bash +adduser mysql && addgroup mysql +mysqld --datadir=/var/mysql --initialize-insecure +mkdir -p /var/run/mysqld +chown mysql.mysql ~/mysql /var/run/mysqld +mysqld -u mysql --datadir=/var/mysql --explicit_defaults_for_timestamp -P 12048" +#+END_SRC -And install with +If you want to run as root you may have to set -#+begin_src bash -env GUIX_PACKAGE_PATH=~/genenetwork/guix-bioinformatics/ \ - guix package -i genenetwork2 \ - --substitute-urls="http://guix.genenetwork.org" -#+end_src +: /etc/my.cnf +: [mysqld] +: user=root -Note: the order of the substitute url's may make a difference in speed -(put the one first that is fastest for your location and time of day). +To check error output in a file on start-up run with something like -Note: if your system starts building or gives an error it may well be -Step 3 did not succeed. The installation should actually be smooth at -this point and only do binary installs (no compiling). +: mysqld -u mysql --console --explicit_defaults_for_timestamp --datadir=/gnu/mysql --log-error=~/test.log -After installation you should be able to run genenetwork2 after updating -the Guix suggested environment vars. Check the output of +Other tips are that Guix installs mysqld in your profile, so this may work -#+begin_src bash -guix package --search-paths -export PYTHONPATH="/root/.guix-profile/lib/python2.7/site-packages" -export R_LIBS_SITE="/root/.guix-profile/site-library/" -#+end_src +: /home/user/.guix-profile/bin/mysqld -u mysql --explicit_defaults_for_timestamp --datadir=/gnu/mysql -and copy-paste the listed exports into the terminal before running: +When you get errors like: -#+begin_src bash -genenetwork2 -#+end_src +: qlalchemy.exc.IntegrityError: (_mysql_exceptions.IntegrityError) (1215, 'Cannot add foreign key constraint') -It will complain that the database is missing. See the next section on -running MySQL server for downloading and installing a MySQL GN2 -database. After installing the database restart genenetwork2 and point -your browser at [[http://localhost:5003/]]. +you may need to set -End of the GN2 installation recipe! +: set foreign_key_checks=0 -* Run MySQL server +** Load the small database in MySQL At this point we require the underlying distribution to install and -run mysqld. Currently we have two databases for deployment, +run mysqld (see next section for GNU Guix). Currently we have two databases for deployment, 'db_webqtl_s' is the small testing database containing experiments from BXD mice and 'db_webqtl_plant' which contains all plant related material. Download one database from -http://files.genenetwork.org/raw_database/ -https://s3.amazonaws.com/genenetwork2/db_webqtl_s.zip +[[http://files.genenetwork.org/raw_database/]] + +[[https://s3.amazonaws.com/genenetwork2/db_webqtl_s.zip]] Check the md5sum. After installation inflate the database binary in the MySQL directory -(this installation path is subject to change soon) +: cd ~/mysql : chown -R mysql:mysql db_webqtl_s/ : chmod 700 db_webqtl_s/ : chmod 660 db_webqtl_s/* -restart MySQL service (mysqld). Login as root and +restart MySQL service (mysqld). Login as root + +: myslq -u root + +and : mysql> show databases; : +--------------------+ @@ -241,9 +175,12 @@ Set permissions and match password in your settings file below: : mysql> grant all privileges on db_webqtl_s.* to gn2@"localhost" identified by 'mysql_password'; +You may need to change "localhost" to whatever domain you are +connecting from (mysql will give an error). + Note that if the mysql connection is not working, try connecting to the IP address and check server firewall, hosts.allow and mysql IP -configuration. +configuration (see below). Note for the plant database you can rename it to db_webqtl_s, or change the settings in etc/default_settings.py to match your path. @@ -255,183 +192,44 @@ Graph of all runtime dependencies as installed by GNU Guix. #+ATTR_HTML: :title GN2_graph http://biogems.info/contrib/genenetwork/gn2.svg -* Source deployment - -This section gives a more elaborate instruction for installing GN2 -from source. - -First execute above 4 steps: - - - [[#step-1-install-gnu-guix][Step 1: Install GNU Guix]] - - [[#step-2-checkout-the-gn2-git-repositories][Step 2: Checkout the GN2 git repositories]] - - [[#step-3-authorize-the-gn-guix-server][Step 3: Authorize the GN Guix server]] - - [[#step-4-install-and-run-gn2-][Step 4: Install and run GN2 ]] - - -** Run your own copy of GN2 - -At some point you may want to fix the source code. Assuming you have -Guix and Genenetwork2 installed (as described above) clone the GN2 -repository from https://github.com/genenetwork/genenetwork2. - -Copy-paste the paths into your terminal (mainly so PYTHON_PATH and -R_LIBS_SITE are set) from the information given by guix: - -: guix package --search-paths - -Inside the repository: - -: cd genenetwork2 -: ./bin/genenetwork2 - -Will fire up your local repo http://localhost:5003/ using the -settings in ./etc/default_settings.py. These settings may -not reflect your system. To override settings create your own from a copy of -default_settings.py and pass it into GN2 with - -: ./bin/genenetwork2 $HOME/my_settings.py - -and everything *should* work (note the full path to the settings -file). This way we develop against the exact same dependency graph of -software. - -If something is not working, take a hint from the settings file -that comes in the Guix installation. It sits in something like +* Working with the GN2 source code -: cat ~/.guix-profile/lib/python2.7/site-packages/genenetwork2-2.0-py2.7.egg/etc/default_settings.py +See [[development.org]]. -** Set up nginx port forwarding +* Running ElasticSearch -nginx can be used as a reverse proxy for GN2. For example, we want to -expose GN2 on port 80 while it is running on port 5003. Essentially -the configuration looks like +In order to start up elasticsearch: +Penguin - change user to "elasticsearch" and use the following command: "env JAVA_HOME=/opt/jdk-9.0.4 /opt/elasticsearch-6.2.1/bin/elasticsearch" -#+begin_src js - server { - listen 80; - server_name test-gn2.genenetwork.org; - access_log logs/test-gn2.access.log; - proxy_connect_timeout 3000; - proxy_send_timeout 3000; - proxy_read_timeout 3000; - send_timeout 3000; +** SystemD - location / { - proxy_set_header Host $http_host; - proxy_set_header Connection keep-alive; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Host $server_name; - proxy_pass http://127.0.0.1:5003; - } -} -#+end_src js +New server - as root run "systemctl restart elasticsearch" -Install the nginx webserver (as root) +#+BEGIN_SRC +tux01:/etc/systemd/system# cat elasticsearch.service +[Unit] +Description=Run Elasticsearch -: guix package -i nginx +[Service] +ExecStart=/opt/elasticsearch-6.2.1/bin/elasticsearch +Environment=JAVA_HOME=/opt/jdk-9.0.4 +Environment="ES_JAVA_OPTS=-Xms1g -Xmx8g" +Environment="PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/opt/jdk-9.0.4/bin" +LimitNOFILE=65536 +StandardOutput=syslog +StandardError=syslog +User=elasticsearch -The nginx example configuration examples can be found in the Guix -store through +[Install] +WantedBy=multi-user.target +#+END_SRC -: ls -l /root/.guix-profile/sbin/nginx -: lrwxrwxrwx 3 root guixbuild 66 Dec 31 1969 /root/.guix-profile/sbin/nginx -> /gnu/store/g0wrcl5z27rmk5b52rldzvk1bzzbnz2l-nginx-1.8.1/sbin/nginx +* Read more -Use that path - -: ls /gnu/store/g0wrcl5z27rmk5b52rldzvk1bzzbnz2l-nginx-1.8.1/share/nginx/conf/ -: fastcgi.conf koi-win scgi_params -: fastcgi.conf.default mime.types scgi_params.default -: fastcgi_params mime.types.default uwsgi_params -: fastcgi_params.default nginx.conf uwsgi_params.default -: koi-utf nginx.conf.default win-utf - -And copy any relevant files to /etc/nginx. A configuration file for -GeneNetwork (reverse proxy) port forwarding can be found in the source -repository under ./etc/nginx-genenetwork.conf. Copy this file to /etc -(still as root) -: cp ./etc/nginx-genenetwork.conf /etc/nginx/ - -Make dirs - -: mkdir -p /var/spool/nginx/logs - -Add users - -: adduser nobody ; addgroup nobody - -Run nginx - -: /root/.guix-profile/sbin/nginx -c /etc/nginx/nginx-genenetwork.conf -p /var/spool/nginx - -* Source deployment and other information on reproducibility - -See the document [[GUIX-Reproducible-from-source.org]]. - -** Update to recent guix - -We now compile Guix from scratch. - -Create, install and run a recent version of the guix-daemon by -compiling the guix repository you have installed with git in -step 2. Follow [[https://github.com/pjotrp/guix-notes/blob/master/INSTALL.org#building-gnu-guix-from-source-using-guix][these]] steps carefully after - -: cd ~/genenetwork/guix-gn-deploy - -Make sure to restart the guix daemon and run guix client from this -directory. - -** Install GN2 - -Reinstall genenetwork2 using the new tree - -#+begin_src bash -env GUIX_PACKAGE_PATH=~/genenetwork/guix-bioinformatics/ ./pre-inst-env guix package -i genenetwork2 --substitute-urls="http://guix.genenetwork.org https://mirror.guixsd.org" -#+end_src bash - -Note the use of ./pre-inst-env here! - -Actually, it should be the same installation as in step 4, so nothing -gets downloaded. - -** Run GN2 - -Make a note of the paths with - -#+begin_src bash -./pre-inst-env guix package --search-paths -#+end_src bash - -or this should also work if guix is installed - -#+begin_src bash -guix package --search-paths -#+end_src bash - -After setting the paths for the server - -#+begin_src bash -export PATH=~/.guix-profile/bin:$PATH -export PYTHONPATH="$HOME/.guix-profile/lib/python2.7/site-packages" -export R_LIBS_SITE="$HOME/.guix-profile/site-library/" -export GUIX_GTK3_PATH="$HOME/.guix-profile/lib/gtk-3.0" -export GI_TYPELIB_PATH="$HOME/.guix-profile/lib/girepository-1.0" -export XDG_DATA_DIRS="$HOME/.guix-profile/share" -export GIO_EXTRA_MODULES="$HOME/.guix-profile/lib/gio/modules" -#+end_src bash - -run the main script (in ~/.guix-profile/bin) - -#+begin_src bash -genenetwork2 -#+end_src bash - -will start the default server which listens on port 5003, i.e., -http://localhost:5003/. - -OK, we are where we were before with step 4. Only difference is that we -used our own compiled guix server. +If you want to understand the architecture of GN2 read +[[Architecture.org]]. The rest of this document is mostly on deployment +of GN2. * Trouble shooting @@ -451,13 +249,17 @@ On one system: : export GEM_PATH="$HOME/.guix-profile/lib/ruby/gems/2.2.0" and perhaps a few more. -** ERROR: can not find directory $HOME/gn2_data +** ERROR: 'can not find directory $HOME/gn2_data' or 'can not find directory $HOME/genotype_files/genotype' The default settings file looks in your $HOME/gn2_data. Since these files come with a Guix installation you should take a hint from the values in the installed version of default_settings.py (see above in this document). +You can use the GENENETWORK_FILES switch to set the datadir, for example + +: env GN2_PROFILE=~/opt/gn-latest GENENETWORK_FILES=/gnu/data/gn2_data ./bin/genenetwork2 + ** Can't run a module In rare cases, development modules are not brought in with Guix @@ -479,6 +281,26 @@ R_LIBS_SITE. Please check your GNU Guix GN2 installation paths, you man need to reinstall. Note that this may be the point you may want to start using profiles (see profile section). +** Mysql can't connect server through socket ERROR + +The following error + +: sqlalchemy.exc.OperationalError: (_mysql_exceptions.OperationalError) (2002, 'Can\'t connect to local MySQL server through socket \'/run/mysqld/mysqld.sock\' (2 "No such file or directory")') + +means that MySQL is trying to connect locally to a non-existent MySQL +server, something you may see in a container. Typically replicated with something like + +: mysql -h localhost + +try to connect over the network interface instead, e.g. + +: mysql -h 127.0.0.1 + +if that works run genenetwork after setting SQL_URI to something like + +: export SQL_URI=mysql://gn2:mysql_password@127.0.0.1/db_webqtl_s + + * IRC session Here an IRC session where we installed GN2 from scratch using GNU Guix diff --git a/doc/database.org b/doc/database.org index 624174a4..5107b660 100644 --- a/doc/database.org +++ b/doc/database.org @@ -1,9 +1,19 @@ -- github Document reduction issue +* Database Information + +WARNING: This document contains information on the GN databases which +will change over time. The GN database is currently MySQL based and, +while efficient, contains a number of design choices we want to grow +'out' of. Especially with an eye on reproducibility we want to +introduce versioning. + +So do not treat the information in this document as a final way of +accessing data. It is better to use the +[[https://github.com/genenetwork/gn_server/blob/master/doc/API.md][REST API]]. * The small test database (2GB) The default install comes with a smaller database which includes a -number of the BSD's and the Human liver dataset (GSE9588). +number of the BXD's and the Human liver dataset (GSE9588). * GeneNetwork database @@ -750,9 +760,30 @@ show indexes from ProbeSetFreeze; | 1 | 5 | 0.303492 | +--------+----------+----------+ -** Publication and publishdata (all pheno) +** Publication + +Publication: + +| Id | PubMed_ID | Abstract | Title | Pages | Month | Year | + -Phenotype pubs +** Publishdata (all pheno) + +One of three phenotype tables. + +mysql> select * from PublishData limit 5; ++---------+----------+-------+ +| Id | StrainId | value | ++---------+----------+-------+ +| 8966353 | 349 | 29.6 | +| 8966353 | 350 | 27.8 | +| 8966353 | 351 | 26.6 | +| 8966353 | 352 | 28.5 | +| 8966353 | 353 | 24.6 | ++---------+----------+-------+ +5 rows in set (0.25 sec) + +See below for phenotype access. ** QuickSearch @@ -1073,7 +1104,37 @@ select * from ProbeSetXRef limit 5; i.e., for Strain Id 1 (DataId) 1, the locus '10.095.400' has a phenotype value of 5.742. -GeneNetwork1 already has a limited REST interface, if you do +Interestingly ProbeData and PublishData have the same layout as +ProbeSetData. ProbeData is only in use for Affy assays - and not used +for computations. PublishData contains trait values. ProbeSetData.id +matches ProbeSetXRef.DataId while PublishData.id matches +PublishXRef.DataId. + +select * from PublishXRef limit 3; ++-------+-------------+-------------+---------------+---------+----------------+------------------+-----------+----------+-------------------------------------------------------+ +| Id | InbredSetId | PhenotypeId | PublicationId | DataId | Locus | LRS | additive | Sequence | comments | ++-------+-------------+-------------+---------------+---------+----------------+------------------+-----------+----------+-------------------------------------------------------+ +| 10001 | 8 | 1 | 1 | 8966353 | D2Mit5 | 10.18351644706 | -1.20875 | 1 | | +| 10001 | 7 | 2 | 53 | 8966813 | D7Mit25UT | 9.85534330983917 | -2.86875 | 1 | | +| 10001 | 4 | 3 | 81 | 8966947 | CEL-6_57082524 | 11.7119505898121 | -23.28875 | 1 | elissa modified Abstract at Tue Jun 7 11:38:00 2005 | ++-------+-------------+-------------+---------------+---------+----------------+------------------+-----------+----------+-------------------------------------------------------+ +3 rows in set (0.00 sec) + +ties the trait data (PublishData) with the inbredsetid (matching +PublishFreeze.InbredSetId), locus and publication. + +select * from PublishFreeze -> ; ++----+------------+--------------------------+-------------+------------+--------+-------------+-----------------+-----------------+ +| Id | Name | FullName | ShortName | CreateTime | public | InbredSetId | confidentiality | AuthorisedUsers | ++----+------------+--------------------------+-------------+------------+--------+-------------+-----------------+-----------------+ +| 1 | BXDPublish | BXD Published Phenotypes | BXDPublish | 2004-07-17 | 2 | 1 | 0 | NULL | +| 18 | HLCPublish | HLC Published Phenotypes | HLC Publish | 2012-02-20 | 2 | 34 | 0 | NULL | ++----+------------+--------------------------+-------------+------------+--------+-------------+-----------------+-----------------+ +2 rows in set (0.02 sec) + +which gives us the datasets. + +GeneNetwork1 has a limited REST interface, if you do : curl "http://robot.genenetwork.org/webqtl/main.py?cmd=get&probeset=1443823_s_at&db=HC_M2_0606_P" @@ -1082,6 +1143,9 @@ we get : ProbeSetID B6D2F1 C57BL/6J DBA/2J BXD1 BXD2 BXD5 BXD6 BXD8 BXD9 BXD11 BXD12 BXD13 BXD15 BXD16 BXD19 BXD20 BXD21 BXD22 BXD23 BXD24 BXD27 BXD28 BXD29 BXD31 BXD32 BXD33 BXD34 BXD38 BXD39 BXD40 BXD42 BXD67 BXD68 BXD43 BXD44 BXD45 BXD48 BXD50 BXD51 BXD55 BXD60 BXD61 BXD62 BXD63 BXD64 BXD65 BXD66 BXD69 BXD70 BXD73 BXD74 BXD75 BXD76 BXD77 BXD79 BXD73a BXD83 BXD84 BXD85 BXD86 BXD87 BXD89 BXD90 BXD65b BXD93 BXD94 A/J AKR/J C3H/HeJ C57BL/6ByJ CXB1 CXB2 CXB3 CXB4 CXB5 CXB6 CXB7 CXB8 CXB9 CXB10 CXB11 CXB12 CXB13 BXD48a 129S1/SvImJ BALB/cJ BALB/cByJ LG/J NOD/ShiLtJ PWD/PhJ BXD65a BXD98 BXD99 CAST/EiJ KK/HlJ WSB/EiJ NZO/HlLtJ PWK/PhJ D2B6F1 : 1443823_s_at 15.251 15.626 14.716 15.198 14.918 15.057 15.232 14.968 14.87 15.084 15.192 14.924 15.343 15.226 15.364 15.36 14.792 14.908 15.344 14.948 15.08 15.021 15.176 15.14 14.796 15.443 14.636 14.921 15.22 15.62 14.816 15.39 15.428 14.982 15.05 15.13 14.722 14.636 15.242 15.527 14.825 14.416 15.125 15.362 15.226 15.176 15.328 14.895 15.141 15.634 14.922 14.764 15.122 15.448 15.398 15.089 14.765 15.234 15.302 14.774 14.979 15.212 15.29 15.012 15.041 15.448 14.34 14.338 14.809 15.046 14.816 15.232 14.933 15.255 15.21 14.766 14.8 15.506 15.749 15.274 15.599 15.673 14.651 14.692 14.552 14.563 14.164 14.546 15.044 14.695 15.162 14.772 14.645 15.493 14.75 14.786 15.003 15.148 15.221 +(see https://github.com/genenetwork/gn_server/blob/master/doc/API.md +for the latest REST API). + getTraitData is defined in the file [[https://github.com/genenetwork/genenetwork/blob/master/web/webqtl/textUI/cmdClass.py#L134][web/webqtl/textUI/cmdClass.py]]. probe is None, so the code at line 199 is run @@ -1165,6 +1229,97 @@ select * from ProbeSetData limit 5; 5 rows in set (0.00 sec) linked by ProbeSetXRef.dataid. + +*** For PublishData: + +List datasets for BXD (InbredSetId=1): + +select * from PublishXRef where InbredSetId=1 limit 3; ++-------+-------------+-------------+---------------+---------+-----------+------------------+------------------+----------+--------------------------------------------------------------------------------+ +| Id | InbredSetId | PhenotypeId | PublicationId | DataId | Locus | LRS | additive | Sequence | comments | ++-------+-------------+-------------+---------------+---------+-----------+------------------+------------------+----------+--------------------------------------------------------------------------------+ +| 10001 | 1 | 4 | 116 | 8967043 | rs8253516 | 13.4974914158039 | 2.39444444444444 | 1 | robwilliams modified post_publication_description at Mon Jul 30 14:58:10 2012 + | +| 10002 | 1 | 10 | 116 | 8967044 | rs3666069 | 22.0042692151629 | 2.08178571428572 | 1 | robwilliams modified phenotype at Thu Oct 28 21:43:28 2010 + | +| 10003 | 1 | 15 | 116 | 8967045 | D18Mit4 | 15.5929163293343 | 19.0882352941176 | 1 | robwilliams modified phenotype at Mon May 23 20:52:19 2011 + | ++-------+-------------+-------------+---------------+---------+-----------+------------------+------------------+----------+--------------------------------------------------------------------------------+ + +where ID is the 'record' or, effectively, dataset. + +select distinct(publicationid) from PublishXRef where InbredSetId=1 limit 3; ++---------------+ +| publicationid | ++---------------+ +| 116 | +| 117 | +| 118 | ++---------------+ + +select distinct +PublishXRef.id,publicationid,phenotypeid,Phenotype.post_publication_description +from PublishXRef,Phenotype where InbredSetId=1 and +phenotypeid=Phenotype.id limit 3; ++-------+---------------+-------------+----------------------------------------------------------------------------------------------------------------------------+ +| id | publicationid | phenotypeid | post_publication_description | ++-------+---------------+-------------+----------------------------------------------------------------------------------------------------------------------------+ +| 10001 | 116 | 4 | Central nervous system, morphology: Cerebellum weight [mg] | +| 10002 | 116 | 10 | Central nervous system, morphology: Cerebellum weight after adjustment for covariance with brain size [mg] | +| 10003 | 116 | 15 | Central nervous system, morphology: Brain weight, male and female adult average, unadjusted for body weight, age, sex [mg] | ++-------+---------------+-------------+----------------------------------------------------------------------------------------------------------------------------+ + +The id field is the same that is used in the GN2 web interface and the +PublicationID ties the datasets together. + +To list trait values: + +SELECT Strain.Name, PublishData.id, PublishData.value from +(Strain,PublishData, PublishXRef) Where PublishData.StrainId = +Strain.id limit 3; + ++------+---------+-------+ +| Name | id | value | ++------+---------+-------+ +| CXB1 | 8966353 | 29.6 | +| CXB1 | 8966353 | 29.6 | +| CXB1 | 8966353 | 29.6 | ++------+---------+-------+ + +here id should match dataid again: + +SELECT Strain.Name, PublishData.id, PublishData.value from +(Strain,PublishData, PublishXRef) Where PublishData.StrainId = +Strain.id and PublishXRef.dataid=8967043 and +PublishXRef.dataid=PublishData.id limit 3; ++------+---------+-------+ +| Name | id | value | ++------+---------+-------+ +| BXD1 | 8967043 | 61.4 | +| BXD2 | 8967043 | 49 | +| BXD5 | 8967043 | 62.5 | ++------+---------+-------+ + +*** Datasets + +The REST API aims to present a unified interface for genotype and +phenotype data. Phenotype datasets appear in two major forms in the +database and we want to present them as one resource. + +Dataset names are defined in ProbeSetFreeze.name and Published.id -> +publication (we'll ignore the probe dataset that uses +ProbeFreeze.name). These tables should be meshed. It looks like the +ids are non-overlapping with the publish record IDs starting at 10,001 +(someone has been smart, though it sets the limit of probesets now to +10,000). + +The datasets are organized differently in these tables. All published +BXD data is grouped on BXDpublished with the publications as +'datasets'. So, that is how we list them in the REST API. + +To fetch all the datasets we first list ProbeSetFreeze entries. Then +we list the Published entries. + ** Fetch genotype information *** SNPs diff --git a/doc/development.org b/doc/development.org new file mode 100644 index 00000000..5e6e318b --- /dev/null +++ b/doc/development.org @@ -0,0 +1,43 @@ +* Development + +** Using GN2_PROFILE + +After cloning the git source tree you can run the contained GN2 using +an existing GN2_PROFILE, i.e., use a profile that was create to run a +binary installation of GN2. This profile may be found by typing + +: which genenetwork2 +: /home/wrk/opt/gn-latest-guix/bin/genenetwork2 + +An example of running the development version would be + +: env GN2_PROFILE=/home/wrk/opt/gn-latest-guix ./bin/genenetwork2 + +Profiles are stored in /gnu/store, so you may pick one up there + +: readlink -f $(dirname $(dirname `which genenetwork2`)) +: /gnu/store/dvckpaw770b00l6rv4ijql8wrk11iypv-profile + +and use that instead. + +Note that the genenetwork2 script sets up the environment for running +the webserver. This includes path to R modules and python modules. These +are output on startup. To make sure there is no environment pollution you can + +** Javascript modules + +As of release 2.10-pre4 we Javascript modules are installed in three places: + +1. JS_GUIX_PATH: the Guix store - these are Guix pre-packaged modules +2. The git source tree (./wqflask/wqflask/static/packages/) +3. JS_GN_PATH: a local directory containing (temporary) development modules + +Packages currently in git (2) will move to JS_GUIX_PATH (1) over +time. This is to keep better track of origin updates. Putting packages +in git (2) is actively discouraged(!), unless there are GN2 specific +adaptations to the original Javascript modules. + +JS_GN_PATH (3) is for development purposes. By default is is set to +$HOME/genenetwork/javascript. Say you are working on an updated +version of a JS module not yet in (1) you can simply check out that +module in that path and it should show up. diff --git a/doc/elasticsearch.org b/doc/elasticsearch.org new file mode 100644 index 00000000..864a8363 --- /dev/null +++ b/doc/elasticsearch.org @@ -0,0 +1,247 @@ +* Elasticsearch + +** Introduction + +GeneNetwork uses elasticsearch (ES) for all things considered +'state'. One example is user collections, another is user management. + +** Example + +To get the right environment, first you can get a python REPL with something like + +: env GN2_PROFILE=~/opt/gn-latest ./bin/genenetwork2 ../etc/default_settings.py -cli python + +(make sure to use the correct GN2_PROFILE!) + +Next try + +#+BEGIN_SRC python + +from elasticsearch import Elasticsearch, TransportError + +es = Elasticsearch([{ "host": 'localhost', "port": '9200' }]) + +# Dump all data + +es.search("*") + +# To fetch an E-mail record from the users index + +record = es.search( + index = 'users', doc_type = 'local', body = { + "query": { "match": { "email_address": "myname@email.com" } } + }) + +# It is also possible to do wild card matching + +q = { "query": { "wildcard" : { "full_name" : "pjot*" } }} +es.search(index = 'users', doc_type = 'local', body = q) + +# To get elements from that record: + +record['hits']['hits'][0][u'_source']['full_name'] +u'Pjotr' + +record['hits']['hits'][0][u'_source']['email_address'] +u"myname@email.com" + +#+END_SRC + +** Health + +ES provides support for checking its health: + +: curl -XGET http://localhost:9200/_cluster/health?pretty=true + +#+BEGIN_SRC json + + + { + "cluster_name" : "asgard", + "status" : "yellow", + "timed_out" : false, + "number_of_nodes" : 1, + "number_of_data_nodes" : 1, + "active_primary_shards" : 5, + "active_shards" : 5, + "relocating_shards" : 0, + "initializing_shards" : 0, + "unassigned_shards" : 5 + } + +#+END_SRC + +Yellow means just one instance is running (no worries). + +To get full cluster info + +: curl -XGET "localhost:9200/_cluster/stats?human&pretty" + +#+BEGIN_SRC json +{ + "_nodes" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + }, + "cluster_name" : "elasticsearch", + "timestamp" : 1529050366452, + "status" : "yellow", + "indices" : { + "count" : 3, + "shards" : { + "total" : 15, + "primaries" : 15, + "replication" : 0.0, + "index" : { + "shards" : { + "min" : 5, + "max" : 5, + "avg" : 5.0 + }, + "primaries" : { + "min" : 5, + "max" : 5, + "avg" : 5.0 + }, + "replication" : { + "min" : 0.0, + "max" : 0.0, + "avg" : 0.0 + } + } + }, + "docs" : { + "count" : 14579, + "deleted" : 0 + }, + "store" : { + "size" : "44.7mb", + "size_in_bytes" : 46892794 + }, + "fielddata" : { + "memory_size" : "0b", + "memory_size_in_bytes" : 0, + "evictions" : 0 + }, + "query_cache" : { + "memory_size" : "0b", + "memory_size_in_bytes" : 0, + "total_count" : 0, + "hit_count" : 0, + "miss_count" : 0, + "cache_size" : 0, + "cache_count" : 0, + "evictions" : 0 + }, + "completion" : { + "size" : "0b", + "size_in_bytes" : 0 + }, + "segments" : { + "count" : 24, + "memory" : "157.3kb", + "memory_in_bytes" : 161112, + "terms_memory" : "122.6kb", + "terms_memory_in_bytes" : 125569, + "stored_fields_memory" : "15.3kb", + "stored_fields_memory_in_bytes" : 15728, + "term_vectors_memory" : "0b", + "term_vectors_memory_in_bytes" : 0, + "norms_memory" : "10.8kb", + "norms_memory_in_bytes" : 11136, + "points_memory" : "111b", + "points_memory_in_bytes" : 111, + "doc_values_memory" : "8.3kb", + "doc_values_memory_in_bytes" : 8568, + "index_writer_memory" : "0b", + "index_writer_memory_in_bytes" : 0, + "version_map_memory" : "0b", + "version_map_memory_in_bytes" : 0, + "fixed_bit_set" : "0b", + "fixed_bit_set_memory_in_bytes" : 0, + "max_unsafe_auto_id_timestamp" : -1, + "file_sizes" : { } + } + }, + "nodes" : { + "count" : { + "total" : 1, + "data" : 1, + "coordinating_only" : 0, + "master" : 1, + "ingest" : 1 + }, + "versions" : [ + "6.2.1" + ], + "os" : { + "available_processors" : 16, + "allocated_processors" : 16, + "names" : [ + { + "name" : "Linux", + "count" : 1 + } + ], + "mem" : { + "total" : "125.9gb", + "total_in_bytes" : 135189286912, + "free" : "48.3gb", + "free_in_bytes" : 51922628608, + "used" : "77.5gb", + "used_in_bytes" : 83266658304, + "free_percent" : 38, + "used_percent" : 62 + } + }, + "process" : { + "cpu" : { + "percent" : 0 + }, + "open_file_descriptors" : { + "min" : 415, + "max" : 415, + "avg" : 415 + } + }, + "jvm" : { + "max_uptime" : "1.9d", + "max_uptime_in_millis" : 165800616, + "versions" : [ + { + "version" : "9.0.4", + "vm_name" : "OpenJDK 64-Bit Server VM", + "vm_version" : "9.0.4+11", + "vm_vendor" : "Oracle Corporation", + "count" : 1 + } + ], + "mem" : { + "heap_used" : "1.1gb", + "heap_used_in_bytes" : 1214872032, + "heap_max" : "23.8gb", + "heap_max_in_bytes" : 25656426496 + }, + "threads" : 110 + }, + "fs" : { + "total" : "786.4gb", + "total_in_bytes" : 844400918528, + "free" : "246.5gb", + "free_in_bytes" : 264688160768, + "available" : "206.5gb", + "available_in_bytes" : 221771468800 + }, + "plugins" : [ ], + "network_types" : { + "transport_types" : { + "netty4" : 1 + }, + "http_types" : { + "netty4" : 1 + } + } + } +} +#+BEGIN_SRC json |