From 67e8f12e103f48329d8b3e38125c0e84b9dc089d Mon Sep 17 00:00:00 2001
From: zsloan
Date: Thu, 17 May 2018 16:32:44 +0000
Subject: Added script to quantile normalize a data set and enter its
 normalized sample data into ElasticSearch

Added option to replace trait page sample/strain values with normalized ones

Began editing Lei's scatterplot code

Changed elasticsearch_tools' get_elasticsearch_connection so that it can also be used for purposes other than user authentication (by adding a "for_user" parameter)
---
 wqflask/maintenance/quantile_normalize.py | 129 ++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 wqflask/maintenance/quantile_normalize.py

(limited to 'wqflask/maintenance/quantile_normalize.py')

diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
new file mode 100644
index 00000000..c11073fb
--- /dev/null
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -0,0 +1,129 @@
+from __future__ import absolute_import, print_function, division
+
+import sys
+sys.path.insert(0,'./')
+
+from itertools import izip
+
+import MySQLdb
+import urlparse
+
+import numpy as np
+import pandas as pd
+from elasticsearch import Elasticsearch, TransportError
+from elasticsearch.helpers import bulk
+
+from flask import Flask, g, request
+
+from wqflask import app
+from utility.elasticsearch_tools import get_elasticsearch_connection
+from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI
+
+def parse_db_uri():
+    """Converts SQL_URI to the db name, host name, user name, and password.
+
+    Returns them as a dict usable as MySQLdb.Connect keyword arguments. The
+    dict is echoed with the password masked so the credential is not logged.
+    """
+    parsed_uri = urlparse.urlparse(SQL_URI)
+    db_conn_info = dict(db = parsed_uri.path[1:],  # path is "/<db name>"
+                        host = parsed_uri.hostname,
+                        user = parsed_uri.username,
+                        passwd = parsed_uri.password)
+    print(dict(db_conn_info, passwd = "***"))
+    return db_conn_info
+
+def create_dataframe(input_file):
+    """Loads a tab-delimited file (minus its header row and first column) as a numeric DataFrame."""
+    with open(input_file) as f:
+        ncols = len(f.readline().split("\t"))
+    # Give loadtxt the path so it closes the file itself; ndmin=2 keeps a single data row a row.
+    return pd.DataFrame(np.loadtxt(input_file, delimiter="\t", skiprows=1, usecols=range(1, ncols), ndmin=2))
+
+#This function adapted from https://github.com/ShawnLYU/Quantile_Normalize
+def quantileNormalize(df_input):
+    """Returns a quantile-normalized copy of df_input; the input is not modified.
+
+    Each value is replaced by the mean, taken across columns, of the values at
+    its rank within its own column. Tied values all get the lowest tied rank.
+    """
+    df = df_input.copy()
+    # Sort each column once; the result feeds both the rank means and the lookup.
+    sorted_cols = dict((col, np.sort(df[col])) for col in df)
+    rank = pd.DataFrame(sorted_cols).mean(axis = 1).values
+    for col in df:
+        df[col] = rank[np.searchsorted(sorted_cols[col], df[col])]
+    return df
+
+def set_data(dataset_name):
+    """Yields one ElasticSearch bulk action per trait row of the dataset's data file.
+
+    Original and quantile-normalized rows are read in step; species, group and
+    dataset full name are looked up through the global Cursor."""
+    orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
+
+    sample_list = []
+    with open(orig_file, 'r') as orig_fh, open('quant_norm.csv', 'r') as quant_fh:
+        for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)):
+            trait_dict = {}
+            sample_list = []
+            # Drop line endings so the last column's name/value carries no "\n"
+            orig_fields = line1.rstrip("\r\n").split('\t')
+            qnorm_fields = line2.rstrip("\r\n").split('\t')
+            if i == 0:
+                sample_names = orig_fields[1:]
+                continue
+            trait_name = orig_fields[0]
+            for col, sample in enumerate(sample_names, 1):
+                sample_list.append({"name": sample,
+                                    "value": orig_fields[col],
+                                    "qnorm": qnorm_fields[col]})
+            # The driver binds both names, so they are never spliced into the SQL
+            query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName
+                       FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet
+                       WHERE Species.Id = InbredSet.SpeciesId and
+                             InbredSet.Id = ProbeFreeze.InbredSetId and
+                             ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId and
+                             ProbeSetFreeze.Name = %s and
+                             ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
+                             ProbeSetXRef.ProbeSetId = ProbeSet.Id and
+                             ProbeSet.Name = %s"""
+            Cursor.execute(query, (dataset_name, trait_name))
+            result_info = Cursor.fetchone()
+
+            yield {"_index": "traits", "_type": "trait",
+                   "_source": {"name": trait_name,
+                               "species": result_info[0],
+                               "group": result_info[1],
+                               "dataset": dataset_name,
+                               "dataset_fullname": result_info[2],
+                               "samples": sample_list,
+                               "transform_types": "qnorm"}}
+
+if __name__ == '__main__':
+    # Cursor must be a module-level global: set_data() runs its queries through it.
+    Conn = MySQLdb.Connect(**parse_db_uri())
+    Cursor = Conn.cursor()
+
+    # for_user=False: the connection is used here for trait documents rather
+    # than for user authentication.
+    es = get_elasticsearch_connection(for_user=False)
+
+    # One-off steps (currently disabled) that build quant_norm.csv from the input file:
+    #input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+    #input_df = create_dataframe(input_filename)
+    #output_df = quantileNormalize(input_df)
+
+    #output_df.to_csv('quant_norm.csv', sep='\t')
+
+    #out_filename = sys.argv[1][:-4] + '_quantnorm.txt'
+
+    #success, _ = bulk(es, set_data(sys.argv[1]))
+
+    response = es.search(
+        index = "traits", doc_type = "trait", body = {
+            "query": { "match": { "name": "ENSMUSG00000028982" } }
+        }
+    )
+
+    print(response)
\ No newline at end of file
-- 
cgit v1.2.3


From 9bb60bb18ae5ac70fe480095554796b7c18f1b6c Mon Sep 17 00:00:00 2001
From: zsloan
Date: Fri, 25 May 2018 15:52:40 +0000
Subject: Fixed issue causing anonymous collections to not work on my branch
 and staging, though still not sure why it's working on production without
 that change

Added script to convert the dryad format genotype files to BIMBAM

removed db_uri from parameters of parse_db_uri in gen_select_dataset.py, since it can now just pull it from settings as a global variable
---
 wqflask/maintenance/convert_dryad_to_bimbam.py     | 70 ++++++++++++++++++++++
 wqflask/maintenance/gen_select_dataset.py          |  2 +-
 wqflask/maintenance/quantile_normalize.py          |  4 +-
 .../new/javascript/dataset_menu_structure.json     | 34 +++++++++++
 wqflask/wqflask/user_manager.py                    | 10 +++-
 5 files changed, 116 insertions(+), 4 deletions(-)
 create mode 100644 wqflask/maintenance/convert_dryad_to_bimbam.py

(limited to 'wqflask/maintenance/quantile_normalize.py')

diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py
new file mode 100644
index 00000000..e833b395
--- /dev/null
+++ b/wqflask/maintenance/convert_dryad_to_bimbam.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+
+"""
+Convert a data dryad genotype file to a BIMBAM _geno file
+
+NOTE: only the _geno file is written; the _snps file is not produced yet.
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+
+
+def read_dryad_file(filename):
+    """Reads a space-delimited dryad genotype file and returns one row per marker.
+
+    The first line is the header: its fields from the third onward are the
+    marker names. Each later line is one sample: its name, a flag, then one
+    genotype per marker. Only samples whose flag is exactly "no" are kept
+    (NOTE(review): presumably an "exclude" column - confirm against the data);
+    the number of skipped samples is printed. Blank lines are ignored.
+
+    Returns a list of rows, one per marker in header order:
+    [marker, "X", "Y", genotype of 1st kept sample, of 2nd kept sample, ...]
+
+    Raises IndexError if a kept sample has fewer genotypes than markers.
+    """
+    exclude_count = 0
+    marker_list = []
+    sample_list = []  # kept sample names, in file order
+    sample_dict = {}  # sample name -> that sample's genotypes
+    with open(filename, 'r') as the_file:
+        for i, line in enumerate(the_file):
+            if not line.strip():
+                continue
+            # Strip the line ending before splitting; otherwise the last field
+            # keeps its "\n" and breaks the last marker's output row.
+            fields = line.rstrip("\r\n").split(" ")
+            if i == 0:
+                marker_list = fields[2:]
+            elif fields[1] == "no":
+                sample_list.append(fields[0])
+                sample_dict[fields[0]] = fields[2:]
+            else:
+                exclude_count += 1
+
+    geno_rows = []
+    for i, marker in enumerate(marker_list):
+        # "X" and "Y" are fixed fillers for the two columns after the marker
+        # (presumably BIMBAM's allele columns - TODO confirm).
+        this_row = [marker, "X", "Y"]
+        this_row.extend(sample_dict[sample][i] for sample in sample_list)
+        geno_rows.append(this_row)
+
+    print(exclude_count)
+
+    return geno_rows
+
+def write_bimbam_files(geno_rows):
+    """Writes each row of geno_rows as one ", "-joined line of the CFW_geno.txt output file."""
+    with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh:
+        geno_fh.writelines(", ".join(row) + "\n" for row in geno_rows)
+
+def convert_dryad_to_bimbam(filename):
+    """Reads the dryad file at filename and writes its marker rows to the geno output file."""
+    write_bimbam_files(read_dryad_file(filename))
+
+if __name__ == "__main__":
+    # sys.argv[1] is the base name of a .txt file in the cfw_data directory
+    convert_dryad_to_bimbam("/home/zas1024/cfw_data/" + sys.argv[1] + ".txt")
\ No newline at end of file
diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index 2825c6ea..18b2dac9 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -63,7 +63,7 @@ from pprint import pformat as pf
 
 #conn = Engine.connect()
 
-def parse_db_uri(db_uri):
+def parse_db_uri():
     """Converts a database URI to the db name, host name, user name, and password"""
 
     parsed_uri = urlparse.urlparse(SQL_URI)
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
index c11073fb..41a3aad8 100644
--- a/wqflask/maintenance/quantile_normalize.py
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -59,7 +59,7 @@ def set_data(dataset_name):
     orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
 
     sample_list = []
-    with open(orig_file, 'r') as orig_fh, open('quant_norm.csv', 'r') as quant_fh:
+    with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh:
         for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)):
             trait_dict = {}
             sample_list = []
@@ -118,7 +118,7 @@ if __name__ == '__main__':
 
     #out_filename = sys.argv[1][:-4] + '_quantnorm.txt'
 
-    #success, _ = bulk(es, set_data(sys.argv[1]))
+    success, _ = bulk(es, set_data(sys.argv[1]))
 
     response = es.search(
         index = "traits", doc_type = "trait", body = {
diff --git a/wqflask/wqflask/static/new/javascript/dataset_menu_structure.json b/wqflask/wqflask/static/new/javascript/dataset_menu_structure.json
index d00b52b8..c605329b 100644
--- a/wqflask/wqflask/static/new/javascript/dataset_menu_structure.json
+++ b/wqflask/wqflask/static/new/javascript/dataset_menu_structure.json
@@ -1966,6 +1966,21 @@
                   "470",
                   "EPFLADEL1013",
                   "EPFL/LISP BXD CD Brown Adipose Affy Mouse Gene 2.0 ST Exon Level (Oct13) RMA"
+               ],
+               [
+                  "777",
+                  "EL_BXDCDHFDScWAT_0216",
+                  "EPFL/LISP BXD CD+HFD Subcutaneous WAT Affy MTA 1.0 Gene Level (Feb16) RMA"
+               ],
+               [
+                  "778",
+                  "EL_BXDHFDScWAT_0216",
+                  "EPFL/LISP BXD HFD Subcutaneous WAT Affy MTA 1.0 Gene Level (Feb16) RMA"
+               ],
+               [
+                  "779",
+                  "EL_BXDCDScWAT_0216",
+                  "EPFL/LISP BXD CD Subcutaneous WAT Affy MTA 1.0 Gene Level (Feb16) RMA **"
                ]
             ],
             "Adrenal Gland mRNA": [
@@ -3355,6 +3370,15 @@
                ]
             ]
          },
+         "D2GM": {
+            "Retina mRNA": [
+               [
+                  "847",
+                  "JAX_D2GM_RSeq_log2Z_0418",
+                  "JAX Retina (Apr18) RNA-Seq log2-Z"
+               ]
+            ]
+         },
          "EMSR": {},
          "HS": {
             "Hippocampus mRNA": [
@@ -3954,6 +3978,10 @@
             "CXB",
             "CXB"
          ],
+         [
+            "D2GM",
+            "D2 Glaucoma Model"
+         ],
          [
             "EMSR",
             "Ethanol-Medicated Stress Reduction"
@@ -5236,6 +5264,12 @@
                "Spleen mRNA"
             ]
          ],
+         "D2GM": [
+            [
+               "Retina mRNA",
+               "Retina mRNA"
+            ]
+         ],
          "EMSR": [],
          "HS": [
             [
diff --git a/wqflask/wqflask/user_manager.py b/wqflask/wqflask/user_manager.py
index d652f2e9..132bae90 100644
--- a/wqflask/wqflask/user_manager.py
+++ b/wqflask/wqflask/user_manager.py
@@ -62,9 +62,17 @@ class AnonUser(object):
             self.anon_id, self.cookie = create_signed_cookie()
         self.key = "anon_collection:v1:{}".format(self.anon_id)
 
-        @after.after_this_request
+        #ZS: This was originally the commented-out function below. For reasons I don't yet understand, that code
+        #    works on production but wouldn't set cookies on staging or my branch; @app.after_request seems to work.
+        #    NOTE(review): @app.after_request registers set_cookie app-wide each time this runs - confirm that is intended.
+        @app.after_request
         def set_cookie(response):
             response.set_cookie(self.cookie_name, self.cookie)
+            return response
+
+        #@after.after_this_request
+        #def set_cookie(response):
+        #    response.set_cookie(self.cookie_name, self.cookie)
 
     def add_collection(self, new_collection):
         collection_dict = dict(name = new_collection.name,
-- 
cgit v1.2.3