aboutsummaryrefslogtreecommitdiff
path: root/wqflask/maintenance/quantile_normalize.py
diff options
context:
space:
mode:
authorzsloan2018-08-14 20:16:32 +0000
committerzsloan2018-08-14 20:16:32 +0000
commit838362c116b02c090dadeb76cda27e9902a6626a (patch)
treea6be104cc73e3bc9e271f9b5ca854dd32f3b810d /wqflask/maintenance/quantile_normalize.py
parent0bead53661ea701ffd9f9d565e4d2ecbbed81a8e (diff)
parent85defabb17ecdef1c7b8e92fa2e06b44d1e9ca49 (diff)
downloadgenenetwork2-838362c116b02c090dadeb76cda27e9902a6626a.tar.gz
Merge branch 'testing' of https://github.com/genenetwork/genenetwork2 into production
Diffstat (limited to 'wqflask/maintenance/quantile_normalize.py')
-rw-r--r--wqflask/maintenance/quantile_normalize.py129
1 files changed, 129 insertions, 0 deletions
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
new file mode 100644
index 00000000..41a3aad8
--- /dev/null
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -0,0 +1,129 @@
+from __future__ import absolute_import, print_function, division
+
+import sys
+sys.path.insert(0,'./')
+
+from itertools import izip
+
+import MySQLdb
+import urlparse
+
+import numpy as np
+import pandas as pd
+from elasticsearch import Elasticsearch, TransportError
+from elasticsearch.helpers import bulk
+
+from flask import Flask, g, request
+
+from wqflask import app
+from utility.elasticsearch_tools import get_elasticsearch_connection
+from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI
+
+def parse_db_uri():
+ """Converts a database URI to the db name, host name, user name, and password"""
+
+ parsed_uri = urlparse.urlparse(SQL_URI)
+
+ db_conn_info = dict(
+ db = parsed_uri.path[1:],
+ host = parsed_uri.hostname,
+ user = parsed_uri.username,
+ passwd = parsed_uri.password)
+
+ print(db_conn_info)
+ return db_conn_info
+
+def create_dataframe(input_file):
+ with open(input_file) as f:
+ ncols = len(f.readline().split("\t"))
+
+ input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t", skiprows=1, usecols=range(1, ncols))
+ return pd.DataFrame(input_array)
+
+#This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+def quantileNormalize(df_input):
+ df = df_input.copy()
+ #compute rank
+ dic = {}
+ for col in df:
+ dic.update({col : sorted(df[col])})
+ sorted_df = pd.DataFrame(dic)
+ rank = sorted_df.mean(axis = 1).tolist()
+ #sort
+ for col in df:
+ t = np.searchsorted(np.sort(df[col]), df[col])
+ df[col] = [rank[i] for i in t]
+ return df
+
+def set_data(dataset_name):
+ orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
+
+ sample_list = []
+ with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh:
+ for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)):
+ trait_dict = {}
+ sample_list = []
+ if i == 0:
+ sample_names = line1.split('\t')[1:]
+ else:
+ trait_name = line1.split('\t')[0]
+ for i, sample in enumerate(sample_names):
+ this_sample = {
+ "name": sample,
+ "value": line1.split('\t')[i+1],
+ "qnorm": line2.split('\t')[i+1]
+ }
+ sample_list.append(this_sample)
+ query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName
+ FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet
+ WHERE Species.Id = InbredSet.SpeciesId and
+ InbredSet.Id = ProbeFreeze.InbredSetId and
+ ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId and
+ ProbeSetFreeze.Name = '%s' and
+ ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
+ ProbeSetXRef.ProbeSetId = ProbeSet.Id and
+ ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0])
+ Cursor.execute(query)
+ result_info = Cursor.fetchone()
+
+ yield {
+ "_index": "traits",
+ "_type": "trait",
+ "_source": {
+ "name": trait_name,
+ "species": result_info[0],
+ "group": result_info[1],
+ "dataset": dataset_name,
+ "dataset_fullname": result_info[2],
+ "samples": sample_list,
+ "transform_types": "qnorm"
+ }
+ }
+
+if __name__ == '__main__':
+ Conn = MySQLdb.Connect(**parse_db_uri())
+ Cursor = Conn.cursor()
+
+ #es = Elasticsearch([{
+ # "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT
+ #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None
+
+ es = get_elasticsearch_connection(for_user=False)
+
+ #input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+ #input_df = create_dataframe(input_filename)
+ #output_df = quantileNormalize(input_df)
+
+ #output_df.to_csv('quant_norm.csv', sep='\t')
+
+ #out_filename = sys.argv[1][:-4] + '_quantnorm.txt'
+
+ success, _ = bulk(es, set_data(sys.argv[1]))
+
+ response = es.search(
+ index = "traits", doc_type = "trait", body = {
+ "query": { "match": { "name": "ENSMUSG00000028982" } }
+ }
+ )
+
+ print(response) \ No newline at end of file