diff options
Diffstat (limited to 'gn2/maintenance/quantile_normalize.py')
-rw-r--r-- | gn2/maintenance/quantile_normalize.py | 98 |
1 files changed, 98 insertions, 0 deletions
diff --git a/gn2/maintenance/quantile_normalize.py b/gn2/maintenance/quantile_normalize.py new file mode 100644 index 00000000..5620b552 --- /dev/null +++ b/gn2/maintenance/quantile_normalize.py @@ -0,0 +1,98 @@ +import sys +sys.path.insert(0, './') +import urllib.parse + +import numpy as np +import pandas as pd + +from flask import Flask, g, request + +from gn2.wqflask import app +from gn2.wqflask.database import database_connection +from gn2.utility.tools import get_setting + + +def create_dataframe(input_file): + with open(input_file) as f: + ncols = len(f.readline().split("\t")) + + input_array = np.loadtxt(open( + input_file, "rb"), delimiter="\t", skiprows=1, usecols=list(range(1, ncols))) + return pd.DataFrame(input_array) + +# This function taken from https://github.com/ShawnLYU/Quantile_Normalize + + +def quantileNormalize(df_input): + df = df_input.copy() + # compute rank + dic = {} + for col in df: + dic.update({col: sorted(df[col])}) + sorted_df = pd.DataFrame(dic) + rank = sorted_df.mean(axis=1).tolist() + # sort + for col in df: + t = np.searchsorted(np.sort(df[col]), df[col]) + df[col] = [rank[i] for i in t] + return df + + +def set_data(cursor, dataset_name): + orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt" + + sample_list = [] + with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh: + for i, (line1, line2) in enumerate(zip(orig_fh, quant_fh)): + trait_dict = {} + sample_list = [] + if i == 0: + sample_names = line1.split('\t')[1:] + else: + trait_name = line1.split('\t')[0] + for i, sample in enumerate(sample_names): + this_sample = { + "name": sample, + "value": line1.split('\t')[i + 1], + "qnorm": line2.split('\t')[i + 1] + } + sample_list.append(this_sample) + query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName + FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet + WHERE Species.Id = InbredSet.SpeciesId and + InbredSet.Id = ProbeFreeze.InbredSetId and + ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId and + ProbeSetFreeze.Name = '%s' and + ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and + ProbeSetXRef.ProbeSetId = ProbeSet.Id and + ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0]) + cursor.execute(query) + result_info = cursor.fetchone() + + yield { + "_index": "traits", + "_type": "trait", + "_source": { + "name": trait_name, + "species": result_info[0], + "group": result_info[1], + "dataset": dataset_name, + "dataset_fullname": result_info[2], + "samples": sample_list, + "transform_types": "qnorm" + } + } + + +if __name__ == '__main__': + with database_connection(get_setting("SQL_URI")) as conn: + with conn.cursor() as cursor: + success, _ = bulk(es, set_data(cursor, sys.argv[1])) + + response = es.search( + index="traits", doc_type="trait", body={ + "query": {"match": {"name": "ENSMUSG00000028982"}} + } + ) + + print(response) |