diff options
Diffstat (limited to 'wqflask/maintenance/quantile_normalize.py')
-rw-r--r-- | wqflask/maintenance/quantile_normalize.py | 58 |
1 files changed, 30 insertions, 28 deletions
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py index 41a3aad8..0cc963e5 100644 --- a/wqflask/maintenance/quantile_normalize.py +++ b/wqflask/maintenance/quantile_normalize.py @@ -1,12 +1,7 @@ -from __future__ import absolute_import, print_function, division - import sys -sys.path.insert(0,'./') - -from itertools import izip - +sys.path.insert(0, './') import MySQLdb -import urlparse +import urllib.parse import numpy as np import pandas as pd @@ -19,48 +14,54 @@ from wqflask import app from utility.elasticsearch_tools import get_elasticsearch_connection from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI + def parse_db_uri(): """Converts a database URI to the db name, host name, user name, and password""" - parsed_uri = urlparse.urlparse(SQL_URI) + parsed_uri = urllib.parse.urlparse(SQL_URI) db_conn_info = dict( - db = parsed_uri.path[1:], - host = parsed_uri.hostname, - user = parsed_uri.username, - passwd = parsed_uri.password) + db=parsed_uri.path[1:], + host=parsed_uri.hostname, + user=parsed_uri.username, + passwd=parsed_uri.password) print(db_conn_info) return db_conn_info + def create_dataframe(input_file): with open(input_file) as f: ncols = len(f.readline().split("\t")) - input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t", skiprows=1, usecols=range(1, ncols)) + input_array = np.loadtxt(open( + input_file, "rb"), delimiter="\t", skiprows=1, usecols=list(range(1, ncols))) return pd.DataFrame(input_array) -#This function taken from https://github.com/ShawnLYU/Quantile_Normalize +# This function taken from https://github.com/ShawnLYU/Quantile_Normalize + + def quantileNormalize(df_input): df = df_input.copy() - #compute rank + # compute rank dic = {} for col in df: - dic.update({col : sorted(df[col])}) + dic.update({col: sorted(df[col])}) sorted_df = pd.DataFrame(dic) - rank = sorted_df.mean(axis = 1).tolist() - #sort + rank = sorted_df.mean(axis=1).tolist() + # sort for col in df: t = np.searchsorted(np.sort(df[col]), df[col]) df[col] = [rank[i] for i in t] return df + def set_data(dataset_name): orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt" sample_list = [] with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh: - for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)): + for i, (line1, line2) in enumerate(zip(orig_fh, quant_fh)): trait_dict = {} sample_list = [] if i == 0: @@ -69,10 +70,10 @@ def set_data(dataset_name): trait_name = line1.split('\t')[0] for i, sample in enumerate(sample_names): this_sample = { - "name": sample, - "value": line1.split('\t')[i+1], - "qnorm": line2.split('\t')[i+1] - } + "name": sample, + "value": line1.split('\t')[i + 1], + "qnorm": line2.split('\t')[i + 1] + } sample_list.append(this_sample) query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet @@ -100,13 +101,14 @@ def set_data(dataset_name): } } + if __name__ == '__main__': Conn = MySQLdb.Connect(**parse_db_uri()) Cursor = Conn.cursor() - #es = Elasticsearch([{ + # es = Elasticsearch([{ # "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT - #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None + # }], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None es = get_elasticsearch_connection(for_user=False) @@ -121,9 +123,9 @@ if __name__ == '__main__': success, _ = bulk(es, set_data(sys.argv[1])) response = es.search( - index = "traits", doc_type = "trait", body = { - "query": { "match": { "name": "ENSMUSG00000028982" } } + index="traits", doc_type="trait", body={ + "query": {"match": {"name": "ENSMUSG00000028982"}} } ) - print(response)
\ No newline at end of file + print(response) |