Merge pull request #270 from pjotrp/testing

Merge some older commits
author: zsloan 2018-01-04 14:10:02 -0600
committer: GitHub 2018-01-04 14:10:02 -0600
commit: 56065c9f994c9247eaa17e5216d44d0e5e733aa2 (patch)
tree: c0efbb873b77eeacdc6da7edc62a934c8088eced
parent: f6ad049db84a83baad9bbf863244b174742380fc (diff)
parent: a2325f723052ff951200020f9b072a2dd5140c01 (diff)
download: genenetwork2-56065c9f994c9247eaa17e5216d44d0e5e733aa2.tar.gz
10 files changed, 236 insertions, 25 deletions
diff --git a/VERSION b/VERSION
index 3e0b7cab..a9a7884c 100644..120000
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.10-pre4
+etc/VERSION
+\ No newline at end of file
diff --git a/bin/genenetwork2 b/bin/genenetwork2
index 5e791885..a7edb1c2 100755
--- a/bin/genenetwork2
+++ b/bin/genenetwork2
@@ -1,5 +1,11 @@
 #! /bin/bash
 #
+# Typical usage
+#
+#   env GN2_PROFILE=~/opt/genenetwork2-phewas ./bin/genenetwork2
+#
+# Where GN2_PROFILE points to the GNU Guix profile used for deployment.
+#
 # This will run the GN2 server (with default settings if none
 # supplied). Typically you need a GNU Guix profile which is set with
 # an environment variable (this profile is dictated by the
@@ -120,6 +126,9 @@ echo -n "dir $TMPDIR
 dbfilename gn2.rdb
 " | redis-server - &
 
+# Overrides for packages that are not yet public (currently r-auwerx)
+export R_LIBS_SITE=$R_LIBS_SITE:$HOME/.Rlibs/das1i1pm54dj6lbdcsw5w0sdwhccyj1a-r-3.3.2/lib/R/lib
+
 # Start the flask server running GN2
 cd $GN2_BASE_DIR/wqflask
 echo "Starting with $settings"
diff --git a/doc/README.org b/doc/README.org
index a39ef603..937a9549 100644
--- a/doc/README.org
+++ b/doc/README.org
@@ -104,11 +104,29 @@ As root configure and run
 :  mysqld --datadir=/var/mysql --initialize-insecure
 :  mkdir -p /var/run/mysqld
 :  chown mysql.mysql ~/mysql /var/run/mysqld
-:  su mysql -c mysqld --datadir=/var/mysql --explicit_defaults_for_timestamp -P 12048
+:  mysqld -u mysql --datadir=/var/mysql --explicit_defaults_for_timestamp -P 12048
 
-/etc/my.cnf
-[mysqld]
-user=root
+If you want to run as root you may have to set
+
+: /etc/my.cnf
+: [mysqld]
+: user=root
+
+To check error output in a file on start-up run with something like
+
+: mysqld -u mysql --console  --explicit_defaults_for_timestamp  --datadir=/gnu/mysql --log-error=~/test.log
+
+Another tip: Guix installs mysqld in your profile, so this may work
+
+: /home/user/.guix-profile/bin/mysqld -u mysql --explicit_defaults_for_timestamp  --datadir=/gnu/mysql
+
+When you get errors like:
+
+: sqlalchemy.exc.IntegrityError: (_mysql_exceptions.IntegrityError) (1215, 'Cannot add foreign key constraint')
+
+you may need to set
+
+: set foreign_key_checks=0
 
 ** Load the small database in MySQL
 
diff --git a/doc/database.org b/doc/database.org
index 624174a4..5107b660 100644
--- a/doc/database.org
+++ b/doc/database.org
@@ -1,9 +1,19 @@
-- github Document reduction issue
+* Database Information
+
+WARNING: This document contains information on the GN databases which
+will change over time. The GN database is currently MySQL based and,
+while efficient, contains a number of design choices we want to grow
+'out' of. Especially with an eye on reproducibility we want to
+introduce versioning.
+
+So do not treat the information in this document as a final way of
+accessing data. It is better to use the
+[[https://github.com/genenetwork/gn_server/blob/master/doc/API.md][REST API]].
 
 * The small test database (2GB)
 
 The default install comes with a smaller database which includes a
-number of the BSD's and the Human liver dataset (GSE9588).
+number of the BXD's and the Human liver dataset (GSE9588).
 
 * GeneNetwork database
 
@@ -750,9 +760,30 @@ show indexes from ProbeSetFreeze;
 |      1 |        5 | 0.303492 |
 +--------+----------+----------+
 
-** Publication and publishdata (all pheno)
+** Publication
+
+Publication:
+
+| Id   | PubMed_ID | Abstract    | Title   | Pages   | Month | Year |
+
 
-Phenotype pubs
+** Publishdata (all pheno)
+
+One of three phenotype tables.
+
+mysql> select * from PublishData limit 5;
++---------+----------+-------+
+| Id      | StrainId | value |
++---------+----------+-------+
+| 8966353 |      349 |  29.6 |
+| 8966353 |      350 |  27.8 |
+| 8966353 |      351 |  26.6 |
+| 8966353 |      352 |  28.5 |
+| 8966353 |      353 |  24.6 |
++---------+----------+-------+
+5 rows in set (0.25 sec)
+
+See below for phenotype access.
 
 ** QuickSearch
 
@@ -1073,7 +1104,37 @@ select * from ProbeSetXRef limit 5;
 i.e., for Strain Id 1 (DataId) 1, the locus '10.095.400' has a
 phenotype value of 5.742.
 
-GeneNetwork1 already has a limited REST interface, if you do
+Interestingly ProbeData and PublishData have the same layout as
+ProbeSetData. ProbeData is only in use for Affy assays - and not used
+for computations. PublishData contains trait values. ProbeSetData.id
+matches ProbeSetXRef.DataId while PublishData.id matches
+PublishXRef.DataId.
+
+select * from PublishXRef limit 3;
++-------+-------------+-------------+---------------+---------+----------------+------------------+-----------+----------+-------------------------------------------------------+
+| Id    | InbredSetId | PhenotypeId | PublicationId | DataId  | Locus          | LRS              | additive  | Sequence | comments                                              |
++-------+-------------+-------------+---------------+---------+----------------+------------------+-----------+----------+-------------------------------------------------------+
+| 10001 |           8 |           1 |             1 | 8966353 | D2Mit5         |   10.18351644706 |  -1.20875 |        1 |                                                       |
+| 10001 |           7 |           2 |            53 | 8966813 | D7Mit25UT      | 9.85534330983917 |  -2.86875 |        1 |                                                       |
+| 10001 |           4 |           3 |            81 | 8966947 | CEL-6_57082524 | 11.7119505898121 | -23.28875 |        1 | elissa modified Abstract at Tue Jun  7 11:38:00 2005  |
++-------+-------------+-------------+---------------+---------+----------------+------------------+-----------+----------+-------------------------------------------------------+
+3 rows in set (0.00 sec)
+
+ties the trait data (PublishData) with the inbredsetid (matching
+PublishFreeze.InbredSetId), locus and publication.
+
+select * from PublishFreeze -> ;
++----+------------+--------------------------+-------------+------------+--------+-------------+-----------------+-----------------+
+| Id | Name       | FullName                 | ShortName   | CreateTime | public | InbredSetId | confidentiality | AuthorisedUsers |
++----+------------+--------------------------+-------------+------------+--------+-------------+-----------------+-----------------+
+|  1 | BXDPublish | BXD Published Phenotypes | BXDPublish  | 2004-07-17 |      2 |           1 |               0 | NULL            |
+| 18 | HLCPublish | HLC Published Phenotypes | HLC Publish | 2012-02-20 |      2 |          34 |               0 | NULL            |
++----+------------+--------------------------+-------------+------------+--------+-------------+-----------------+-----------------+
+2 rows in set (0.02 sec)
+
+which gives us the datasets.
+
+GeneNetwork1 has a limited REST interface, if you do
 
 : curl "http://robot.genenetwork.org/webqtl/main.py?cmd=get&probeset=1443823_s_at&db=HC_M2_0606_P"
 
@@ -1082,6 +1143,9 @@ we get
 : ProbeSetID      B6D2F1  C57BL/6J        DBA/2J  BXD1    BXD2    BXD5    BXD6   BXD8     BXD9    BXD11   BXD12   BXD13   BXD15   BXD16   BXD19   BXD20   BXD21  BXD22    BXD23   BXD24   BXD27   BXD28   BXD29   BXD31   BXD32   BXD33   BXD34  BXD38    BXD39   BXD40   BXD42   BXD67   BXD68   BXD43   BXD44   BXD45   BXD48  BXD50    BXD51   BXD55   BXD60   BXD61   BXD62   BXD63   BXD64   BXD65   BXD66  BXD69    BXD70   BXD73   BXD74   BXD75   BXD76   BXD77   BXD79   BXD73a  BXD83  BXD84    BXD85   BXD86   BXD87   BXD89   BXD90   BXD65b  BXD93   BXD94   A/J    AKR/J    C3H/HeJ C57BL/6ByJ      CXB1    CXB2    CXB3    CXB4    CXB5    CXB6   CXB7     CXB8    CXB9    CXB10   CXB11   CXB12   CXB13   BXD48a  129S1/SvImJ    BALB/cJ  BALB/cByJ       LG/J    NOD/ShiLtJ      PWD/PhJ BXD65a  BXD98   BXD99  CAST/EiJ KK/HlJ  WSB/EiJ NZO/HlLtJ       PWK/PhJ D2B6F1
 : 1443823_s_at    15.251  15.626  14.716  15.198  14.918  15.057  15.232  14.968 14.87    15.084  15.192  14.924  15.343  15.226  15.364  15.36   14.792  14.908 15.344   14.948  15.08   15.021  15.176  15.14   14.796  15.443  14.636  14.921 15.22    15.62   14.816  15.39   15.428  14.982  15.05   15.13   14.722  14.636 15.242   15.527  14.825  14.416  15.125  15.362  15.226  15.176  15.328  14.895 15.141   15.634  14.922  14.764  15.122  15.448  15.398  15.089  14.765  15.234 15.302   14.774  14.979  15.212  15.29   15.012  15.041  15.448  14.34   14.338 14.809   15.046  14.816  15.232  14.933  15.255  15.21   14.766  14.8    15.506 15.749   15.274  15.599  15.673  14.651  14.692  14.552  14.563  14.164  14.546 15.044   14.695  15.162  14.772  14.645  15.493  14.75   14.786  15.003  15.148 15.221
 
+(see https://github.com/genenetwork/gn_server/blob/master/doc/API.md
+for the latest REST API).
+
 getTraitData is defined in the file [[https://github.com/genenetwork/genenetwork/blob/master/web/webqtl/textUI/cmdClass.py#L134][web/webqtl/textUI/cmdClass.py]].
 probe is None, so the code at line 199 is run
 
@@ -1165,6 +1229,97 @@ select * from ProbeSetData limit 5;
 5 rows in set (0.00 sec)
 
 linked by ProbeSetXRef.dataid.
+
+*** For PublishData:
+
+List datasets for BXD (InbredSetId=1):
+
+select * from PublishXRef where InbredSetId=1 limit 3;
++-------+-------------+-------------+---------------+---------+-----------+------------------+------------------+----------+--------------------------------------------------------------------------------+
+| Id    | InbredSetId | PhenotypeId | PublicationId | DataId  | Locus     | LRS              | additive         | Sequence | comments                                                                       |
++-------+-------------+-------------+---------------+---------+-----------+------------------+------------------+----------+--------------------------------------------------------------------------------+
+| 10001 |           1 |           4 |           116 | 8967043 | rs8253516 | 13.4974914158039 | 2.39444444444444 |        1 | robwilliams modified post_publication_description at Mon Jul 30 14:58:10 2012
+ |
+| 10002 |           1 |          10 |           116 | 8967044 | rs3666069 | 22.0042692151629 | 2.08178571428572 |        1 | robwilliams modified phenotype at Thu Oct 28 21:43:28 2010
+                    |
+| 10003 |           1 |          15 |           116 | 8967045 | D18Mit4   | 15.5929163293343 | 19.0882352941176 |        1 | robwilliams modified phenotype at Mon May 23 20:52:19 2011
+                    |
++-------+-------------+-------------+---------------+---------+-----------+------------------+------------------+----------+--------------------------------------------------------------------------------+
+
+where ID is the 'record' or, effectively, dataset.
+
+select distinct(publicationid) from PublishXRef where InbredSetId=1 limit 3;
++---------------+
+| publicationid |
++---------------+
+|           116 |
+|           117 |
+|           118 |
++---------------+
+
+select distinct
+PublishXRef.id,publicationid,phenotypeid,Phenotype.post_publication_description
+from PublishXRef,Phenotype where InbredSetId=1 and
+phenotypeid=Phenotype.id limit 3;
++-------+---------------+-------------+----------------------------------------------------------------------------------------------------------------------------+
+| id    | publicationid | phenotypeid | post_publication_description                                                                                               |
++-------+---------------+-------------+----------------------------------------------------------------------------------------------------------------------------+
+| 10001 |           116 |           4 | Central nervous system, morphology: Cerebellum weight [mg]                                                                 |
+| 10002 |           116 |          10 | Central nervous system, morphology: Cerebellum weight after adjustment for covariance with brain size [mg]                 |
+| 10003 |           116 |          15 | Central nervous system, morphology: Brain weight, male and female adult average, unadjusted for body weight, age, sex [mg] |
++-------+---------------+-------------+----------------------------------------------------------------------------------------------------------------------------+
+
+The id field is the same that is used in the GN2 web interface and the
+PublicationID ties the datasets together.
+
+To list trait values:
+
+SELECT Strain.Name, PublishData.id, PublishData.value from
+(Strain,PublishData, PublishXRef) Where PublishData.StrainId =
+Strain.id limit 3;
+
++------+---------+-------+
+| Name | id      | value |
++------+---------+-------+
+| CXB1 | 8966353 |  29.6 |
+| CXB1 | 8966353 |  29.6 |
+| CXB1 | 8966353 |  29.6 |
++------+---------+-------+
+
+here id should match dataid again:
+
+SELECT Strain.Name, PublishData.id, PublishData.value from
+(Strain,PublishData, PublishXRef) Where PublishData.StrainId =
+Strain.id and PublishXRef.dataid=8967043 and
+PublishXRef.dataid=PublishData.id limit 3;
++------+---------+-------+
+| Name | id      | value |
++------+---------+-------+
+| BXD1 | 8967043 |  61.4 |
+| BXD2 | 8967043 |    49 |
+| BXD5 | 8967043 |  62.5 |
++------+---------+-------+
+
+*** Datasets
+
+The REST API aims to present a unified interface for genotype and
+phenotype data. Phenotype datasets appear in two major forms in the
+database and we want to present them as one resource.
+
+Dataset names are defined in ProbeSetFreeze.name and Published.id ->
+publication (we'll ignore the probe dataset that uses
+ProbeFreeze.name). These tables should be meshed. It looks like the
+ids are non-overlapping with the publish record IDs starting at 10,001
+(someone has been smart, though it sets the limit of probesets now to
+10,000).
+
+The datasets are organized differently in these tables. All published
+BXD data is grouped under BXDPublish with the publications as
+'datasets'. So, that is how we list them in the REST API.
+
+To fetch all the datasets we first list ProbeSetFreeze entries. Then
+we list the Published entries.
+
 ** Fetch genotype information
 
 *** SNPs
diff --git a/etc/VERSION b/etc/VERSION
index 1785aa28..b624c74a 100644
--- a/etc/VERSION
+++ b/etc/VERSION
@@ -1 +1 @@
-2.10rc3
+2.10rc5
diff --git a/etc/default_settings.py b/etc/default_settings.py
index c00f6c8f..59e22f1a 100644
--- a/etc/default_settings.py
+++ b/etc/default_settings.py
@@ -2,7 +2,7 @@
 # webserver running in developer mode with limited console
 # output. Copy this file and run it from ./bin/genenetwork2 configfile
 #
-# Note that these settings are fetched in ./wqflask/utilities/tools.py
+# Note: these settings are fetched in ./wqflask/utilities/tools.py
 # which has support for overriding them through environment variables,
 # e.g.
 #
@@ -14,8 +14,12 @@
 # Note also that in the near future we will additionally fetch
 # settings from a JSON file
 #
-# Note that values for False and 0 have to be strings here - otherwise
+# Note: values for False and 0 have to be strings here - otherwise
 # Flask won't pick them up
+#
+# For GNU Guix deployment also check the paths in
+#
+#  ~/.guix-profile/lib/python2.7/site-packages/genenetwork2-2.0-py2.7.egg/etc/default_settings.py
 
 import os
 import sys
@@ -34,7 +38,7 @@ SECURITY_RECOVERABLE = True
 SECURITY_EMAIL_SENDER = "no-reply@genenetwork.org"
 SECURITY_POST_LOGIN_VIEW = "/thank_you"
 
-SERVER_PORT = 5003
+SERVER_PORT = 5003          # running on localhost
 SECRET_HMAC_CODE = '\x08\xdf\xfa\x93N\x80\xd9\\H@\\\x9f`\x98d^\xb4a;\xc6OM\x946a\xbc\xfc\x80:*\xebc'
 
 # ---- Behavioural settings (defaults) note that logger and log levels can
@@ -42,6 +46,7 @@ SECRET_HMAC_CODE = '\x08\xdf\xfa\x93N\x80\xd9\\H@\\\x9f`\x98d^\xb4a;\xc6OM\x946a
 WEBSERVER_MODE  = 'DEV'     # Python webserver mode (DEBUG|DEV|PROD)
 WEBSERVER_BRANDING = None   # Set the branding (nyi)
 WEBSERVER_DEPLOY = None     # Deployment specifics (nyi)
+WEBSERVER_URL    = "http://localhost:"+str(SERVER_PORT)+"/" # external URL
 
 LOG_LEVEL       = 'WARNING' # Logger mode (DEBUG|INFO|WARNING|ERROR|CRITICAL)
 LOG_LEVEL_DEBUG = '0'       # logger.debugf log level (0-5, 5 = show all)
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index 6649f8af..a4eaaa2e 100644
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -332,7 +332,7 @@ class DatasetGroup(object):
         if check_plink_gemma():
             marker_class = HumanMarkers
         else:
-            marker_class = Markers            
+            marker_class = Markers
 
         if self.genofile:
             self.markers = marker_class(self.genofile[:-5])
diff --git a/wqflask/runserver.py b/wqflask/runserver.py
index 50805643..50f134db 100644
--- a/wqflask/runserver.py
+++ b/wqflask/runserver.py
@@ -22,11 +22,19 @@ ENDC  = '\033[0m'
 import os
 app.config['SECRET_KEY'] = os.urandom(24)
 
-from utility.tools import WEBSERVER_MODE,get_setting_int
+from utility.tools import WEBSERVER_MODE,get_setting_int,get_setting,get_setting_bool
 
 port = get_setting_int("SERVER_PORT")
 
-logger.info("GN2 is running. Visit %shttp://localhost:%s/%s" % (BLUE,port,ENDC))
+print("GN2 API server URL is ["+BLUE+get_setting("GN_SERVER_URL")+ENDC+"]")
+
+if get_setting_bool("USE_GN_SERVER"):
+    import requests
+    page = requests.get(get_setting("GN_SERVER_URL"))
+    if page.status_code != 200:
+        raise Exception("API server not found!")
+
+print("GN2 is running. Visit %s[http://localhost:%s/%s](%s)" % (BLUE,str(port),ENDC,get_setting("WEBSERVER_URL")))
 
 werkzeug_logger = logging.getLogger('werkzeug')
 
diff --git a/wqflask/utility/logger.py b/wqflask/utility/logger.py
index bacb0aa4..128706df 100644
--- a/wqflask/utility/logger.py
+++ b/wqflask/utility/logger.py
@@ -72,7 +72,7 @@ LOG_LEVEL_DEBUG (NYI).
     def warning(self,*args):
         """Call logging.warning for multiple args"""
         self.collect(self.logger.warning,*args)
-        self.logger.warning(self.collect(*args))
+        # self.logger.warning(self.collect(*args))
 
     def error(self,*args):
         """Call logging.error for multiple args"""
diff --git a/wqflask/utility/tools.py b/wqflask/utility/tools.py
index c5685cdd..57f97a81 100644
--- a/wqflask/utility/tools.py
+++ b/wqflask/utility/tools.py
@@ -105,7 +105,7 @@ def js_path(module=None):
     try_guix = get_setting("JS_GUIX_PATH")+"/"+module
     if valid_path(try_guix):
         return try_guix
-    raise "No JS path found for "+module+" (check JS_GN_PATH)"
+    raise "No JS path found for "+module+" (if not in Guix check JS_GN_PATH)"
 
 def pylmm_command(guess=None):
     return assert_bin(get_setting("PYLMM_COMMAND",guess))
@@ -147,9 +147,14 @@ def assert_writable_dir(dir):
         fh.close()
         os.remove(fn)
     except IOError:
-        raise Exception('Unable to write test.txt to directory ' + dir )
+        raise Exception('Unable to write test.txt to directory ' + dir)
     return dir
 
+def assert_file(fn):
+    if not valid_file(fn):
+        raise Exception('Unable to find file '+fn)
+    return fn
+
 def mk_dir(dir):
     if not valid_path(dir):
         os.makedirs(dir)
@@ -174,6 +179,9 @@ def locate(name, subdir=None):
     if subdir: sys.stderr.write(subdir)
     raise Exception("Can not locate "+name+" in "+base)
 
+def locate_phewas(name, subdir=None):
+    return locate(name,'/phewas/'+subdir)
+
 def locate_ignore_error(name, subdir=None):
     """
     Locate a static flat file in the GENENETWORK_FILES environment.
@@ -239,15 +247,16 @@ USE_GN_SERVER      = get_setting_bool('USE_GN_SERVER')
 
 GENENETWORK_FILES  = get_setting('GENENETWORK_FILES')
 JS_GUIX_PATH       = get_setting('JS_GUIX_PATH')
-# assert_dir(JS_GUIX_PATH) - don't enforce right now
+assert_dir(JS_GUIX_PATH)
 JS_GN_PATH         = get_setting('JS_GN_PATH')
 # assert_dir(JS_GN_PATH)
 
-PYLMM_COMMAND         = pylmm_command()
-GEMMA_COMMAND         = gemma_command()
+PYLMM_COMMAND      = app_set("PYLMM_COMMAND",pylmm_command())
+GEMMA_COMMAND      = app_set("GEMMA_COMMAND",gemma_command())
+PLINK_COMMAND      = app_set("PLINK_COMMAND",plink_command())
 GEMMA_WRAPPER_COMMAND = gemma_wrapper_command()
-PLINK_COMMAND         = plink_command()
-TEMPDIR               = tempdir() # defaults to UNIX TMPDIR
+TEMPDIR            = tempdir() # defaults to UNIX TMPDIR
+assert_dir(TEMPDIR)
 
 # ---- Handle specific JS modules
 JS_TWITTER_POST_FETCHER_PATH = get_setting("JS_TWITTER_POST_FETCHER_PATH",js_path("Twitter-Post-Fetcher"))
@@ -267,3 +276,10 @@ if os.environ.get('WQFLASK_OVERRIDES'):
             else:
                 OVERRIDES[k] = cmd
             logger.debug(OVERRIDES)
+
+# assert_file(PHEWAS_FILES+"/auwerx/PheWAS_pval_EMMA_norm.RData")
+# assert_dir(get_setting("JS_BIODALLIANCE"))
+# assert_file(get_setting("JS_BIODALLIANCE")+"/build/dalliance-all.js")
+# assert_file(get_setting("JS_BIODALLIANCE")+"/build/worker-all.js")
+# assert_dir(get_setting("JS_TWITTER_POST_FETCHER"))
+assert_file(JS_TWITTER_POST_FETCHER_PATH+"/js/twitterFetcher_min.js")
author	zsloan	2018-01-04 14:10:02 -0600
committer	GitHub	2018-01-04 14:10:02 -0600
commit	56065c9f994c9247eaa17e5216d44d0e5e733aa2 (patch)
tree	c0efbb873b77eeacdc6da7edc62a934c8088eced
parent	f6ad049db84a83baad9bbf863244b174742380fc (diff)
parent	a2325f723052ff951200020f9b072a2dd5140c01 (diff)
download	genenetwork2-56065c9f994c9247eaa17e5216d44d0e5e733aa2.tar.gz