diff options
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r-- | wqflask/maintenance/convert_dryad_to_bimbam.py | 70 | ||||
-rw-r--r-- | wqflask/maintenance/convert_geno_to_bimbam.py | 77 | ||||
-rw-r--r-- | wqflask/maintenance/gen_select_dataset.py | 9 | ||||
-rw-r--r-- | wqflask/maintenance/geno_to_json.py | 195 | ||||
-rw-r--r-- | wqflask/maintenance/get_group_samplelists.py | 10 | ||||
-rw-r--r-- | wqflask/maintenance/quantile_normalize.py | 129 |
6 files changed, 410 insertions, 80 deletions
diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py new file mode 100644 index 00000000..e833b395 --- /dev/null +++ b/wqflask/maintenance/convert_dryad_to_bimbam.py @@ -0,0 +1,70 @@ +#!/usr/bin/python + +""" +Convert data dryad files to a BIMBAM _geno and _snps file + + +""" + +from __future__ import print_function, division, absolute_import +import sys +sys.path.append("..") + + +def read_dryad_file(filename): + exclude_count = 0 + marker_list = [] + sample_dict = {} + sample_list = [] + geno_rows = [] + with open(filename, 'r') as the_file: + for i, line in enumerate(the_file): + if i > 0: + if line.split(" ")[1] == "no": + sample_name = line.split(" ")[0] + sample_list.append(sample_name) + sample_dict[sample_name] = line.split(" ")[2:] + else: + exclude_count += 1 + else: + marker_list = line.split(" ")[2:] + + for i, marker in enumerate(marker_list): + this_row = [] + this_row.append(marker) + this_row.append("X") + this_row.append("Y") + for sample in sample_list: + this_row.append(sample_dict[sample][i]) + geno_rows.append(this_row) + + print(exclude_count) + + return geno_rows + + #for i, marker in enumerate(marker_list): + # this_row = [] + # this_row.append(marker) + # this_row.append("X") + # this_row.append("Y") + # with open(filename, 'r') as the_file: + # for j, line in enumerate(the_file): + # if j > 0: + # this_row.append(line.split(" ")[i+2]) + # print("row: " + str(i)) + # geno_rows.append(this_row) + # + #return geno_rows + +def write_bimbam_files(geno_rows): + with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh: + for row in geno_rows: + geno_fh.write(", ".join(row) + "\n") + +def convert_dryad_to_bimbam(filename): + geno_file_rows = read_dryad_file(filename) + write_bimbam_files(geno_file_rows) + +if __name__=="__main__": + input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt" + convert_dryad_to_bimbam(input_filename)
\ No newline at end of file diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py index 05006d5c..45522705 100644 --- a/wqflask/maintenance/convert_geno_to_bimbam.py +++ b/wqflask/maintenance/convert_geno_to_bimbam.py @@ -17,17 +17,12 @@ import glob import traceback import gzip -#import numpy as np -#from pyLMM import lmm - import simplejson as json from pprint import pformat as pf class EmptyConfigurations(Exception): pass - - class Marker(object): def __init__(self): self.name = None @@ -39,47 +34,34 @@ class Marker(object): class ConvertGenoFile(object): def __init__(self, input_file, output_files): - self.input_file = input_file self.output_files = output_files - + self.mb_exists = False self.cm_exists = False self.markers = [] - + self.latest_row_pos = None self.latest_col_pos = None - + self.latest_row_value = None self.latest_col_value = None - - def convert(self): + def convert(self): self.haplotype_notation = { '@mat': "1", '@pat': "0", '@het': "0.5", '@unk': "NA" } - + self.configurations = {} - #self.skipped_cols = 3 - - #if self.input_file.endswith(".geno.gz"): - # print("self.input_file: ", self.input_file) - # self.input_fh = gzip.open(self.input_file) - #else: self.input_fh = open(self.input_file) - - with open(self.output_files[0], "w") as self.geno_fh: - #if self.file_type == "geno": - self.process_csv() - #elif self.file_type == "snps": - # self.process_snps_file() + self.process_csv() def process_csv(self): - for row_count, row in enumerate(self.process_rows()): + for row in self.process_rows(): row_items = row.split("\t") this_marker = Marker() @@ -102,53 +84,30 @@ class ConvertGenoFile(object): this_marker.genotypes.append(self.configurations[genotype.upper().strip()]) else: this_marker.genotypes.append("NA") - - #print("this_marker is:", pf(this_marker.__dict__)) - #if this_marker.chr == "14": + self.markers.append(this_marker.__dict__) self.write_to_bimbam() - - # with open(self.output_file, 'w') as fh: - # json.dump(self.markers, fh, indent=" ", sort_keys=True) - - # print('configurations:', str(configurations)) - #self.latest_col_pos = item_count + self.skipped_cols - #self.latest_col_value = item - - #if item_count != 0: - # self.output_fh.write(" ") - #self.output_fh.write(self.configurations[item.upper()]) - - #self.output_fh.write("\n") def write_to_bimbam(self): with open(self.output_files[0], "w") as geno_fh: - # geno_fh.write(str(len(self.sample_list)) + "\n") - # geno_fh.write("2\n") - # geno_fh.write("IND") - # for sample in self.sample_list: - # geno_fh.write(" " + sample) - # geno_fh.write("\n") for marker in self.markers: geno_fh.write(marker['name']) geno_fh.write(", X, Y") geno_fh.write(", " + ", ".join(marker['genotypes'])) geno_fh.write("\n") - - #pheno_fh = open(self.output_files[1], 'w') + with open(self.output_files[1], "w") as pheno_fh: for sample in self.sample_list: pheno_fh.write("1\n") - + with open(self.output_files[2], "w") as snp_fh: for marker in self.markers: if self.mb_exists: snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n") else: snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n") - - + def get_sample_list(self, row_contents): self.sample_list = [] if self.mb_exists: @@ -164,8 +123,6 @@ class ConvertGenoFile(object): def process_rows(self): for self.latest_row_pos, row in enumerate(self.input_fh): - #if self.input_file.endswith(".geno.gz"): - # print("row: ", row) self.latest_row_value = row # Take care of headers if not row.strip(): @@ -208,10 +165,8 @@ class ConvertGenoFile(object): convertob.convert() except EmptyConfigurations as why: print(" No config info? Continuing...") - #excepted = True continue except Exception as why: - print(" Exception:", why) print(traceback.print_exc()) print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, @@ -219,12 +174,6 @@ class ConvertGenoFile(object): print(" Column is:", convertob.latest_col_value) print(" Row is:", convertob.latest_row_value) break - - #def process_snps_file(cls, snps_file, new_directory): - # output_file = os.path.join(new_directory, "mouse_families.json") - # print("%s -> %s" % (snps_file, output_file)) - # convertob = ConvertGenoFile(input_file, output_file) - if __name__=="__main__": Old_Geno_Directory = """/home/zas1024/genotype_files/genotype/""" @@ -234,6 +183,4 @@ if __name__=="__main__": #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json") #convertob.convert() ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory) - #ConvertGenoFiles(Geno_Directory) - - #process_csv(Input_File, Output_File)
\ No newline at end of file + #ConvertGenoFiles(Geno_Directory)
\ No newline at end of file diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py index 79242661..18b2dac9 100644 --- a/wqflask/maintenance/gen_select_dataset.py +++ b/wqflask/maintenance/gen_select_dataset.py @@ -63,7 +63,7 @@ from pprint import pformat as pf #conn = Engine.connect() -def parse_db_uri(db_uri): +def parse_db_uri(): """Converts a database URI to the db name, host name, user name, and password""" parsed_uri = urlparse.urlparse(SQL_URI) @@ -78,7 +78,6 @@ def parse_db_uri(db_uri): return db_conn_info - def get_species(): """Build species list""" Cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId") @@ -265,7 +264,7 @@ def build_datasets(species, group, type_name): def main(): """Generates and outputs (as json file) the data for the main dropdown menus on the home page""" - parse_db_uri(SQL_URI) + parse_db_uri() species = get_species() groups = get_groups(species) @@ -304,6 +303,6 @@ def _test_it(): #print("build_datasets:", pf(datasets)) if __name__ == '__main__': - Conn = MySQLdb.Connect(**parse_db_uri(SQL_URI)) + Conn = MySQLdb.Connect(**parse_db_uri()) Cursor = Conn.cursor() - main() + main()
\ No newline at end of file diff --git a/wqflask/maintenance/geno_to_json.py b/wqflask/maintenance/geno_to_json.py new file mode 100644 index 00000000..789a1691 --- /dev/null +++ b/wqflask/maintenance/geno_to_json.py @@ -0,0 +1,195 @@ +#!/usr/bin/python + +""" +Convert .geno files to json + +This file goes through all of the genofiles in the genofile directory (.geno) +and converts them to json files that are used when running the marker regression +code + +""" + +from __future__ import print_function, division, absolute_import +import sys +sys.path.append("..") +import os +import glob +import traceback +import gzip + +#import numpy as np +#from pyLMM import lmm + +import simplejson as json + +from pprint import pformat as pf + +class EmptyConfigurations(Exception): pass + + + +class Marker(object): + def __init__(self): + self.name = None + self.chr = None + self.cM = None + self.Mb = None + self.genotypes = [] + +class ConvertGenoFile(object): + + def __init__(self, input_file, output_file): + + self.input_file = input_file + self.output_file = output_file + + self.mb_exists = False + self.cm_exists = False + self.markers = [] + + self.latest_row_pos = None + self.latest_col_pos = None + + self.latest_row_value = None + self.latest_col_value = None + + def convert(self): + + self.haplotype_notation = { + '@mat': "1", + '@pat': "0", + '@het': "0.5", + '@unk': "NA" + } + + self.configurations = {} + #self.skipped_cols = 3 + + #if self.input_file.endswith(".geno.gz"): + # print("self.input_file: ", self.input_file) + # self.input_fh = gzip.open(self.input_file) + #else: + self.input_fh = open(self.input_file) + + with open(self.output_file, "w") as self.output_fh: + #if self.file_type == "geno": + self.process_csv() + #elif self.file_type == "snps": + # self.process_snps_file() + + + def process_csv(self): + for row_count, row in enumerate(self.process_rows()): + row_items = row.split("\t") + + this_marker = Marker() + this_marker.name = row_items[1] + this_marker.chr = row_items[0] + if self.cm_exists and self.mb_exists: + this_marker.cM = row_items[2] + this_marker.Mb = row_items[3] + genotypes = row_items[4:] + elif self.cm_exists: + this_marker.cM = row_items[2] + genotypes = row_items[3:] + elif self.mb_exists: + this_marker.Mb = row_items[2] + genotypes = row_items[3:] + else: + genotypes = row_items[2:] + for item_count, genotype in enumerate(genotypes): + if genotype.upper() in self.configurations: + this_marker.genotypes.append(self.configurations[genotype.upper()]) + else: + this_marker.genotypes.append("NA") + + #print("this_marker is:", pf(this_marker.__dict__)) + #if this_marker.chr == "14": + self.markers.append(this_marker.__dict__) + + with open(self.output_file, 'w') as fh: + json.dump(self.markers, fh, indent=" ", sort_keys=True) + + # print('configurations:', str(configurations)) + #self.latest_col_pos = item_count + self.skipped_cols + #self.latest_col_value = item + + #if item_count != 0: + # self.output_fh.write(" ") + #self.output_fh.write(self.configurations[item.upper()]) + + #self.output_fh.write("\n") + + + def process_rows(self): + for self.latest_row_pos, row in enumerate(self.input_fh): + #if self.input_file.endswith(".geno.gz"): + # print("row: ", row) + self.latest_row_value = row + # Take care of headers + if not row.strip(): + continue + if row.startswith('#'): + continue + if row.startswith('Chr'): + if 'Mb' in row.split(): + self.mb_exists = True + if 'cM' in row.split(): + self.cm_exists = True + continue + if row.startswith('@'): + key, _separater, value = row.partition(':') + key = key.strip() + value = value.strip() + if key in self.haplotype_notation: + self.configurations[value] = self.haplotype_notation[key] + continue + if not len(self.configurations): + raise EmptyConfigurations + yield row + + @classmethod + def process_all(cls, old_directory, new_directory): + os.chdir(old_directory) + for input_file in glob.glob("*"): + if not input_file.endswith(('geno', '.geno.gz')): + continue + group_name = ".".join(input_file.split('.')[:-1]) + output_file = os.path.join(new_directory, group_name + ".json") + print("%s -> %s" % ( + os.path.join(old_directory, input_file), output_file)) + convertob = ConvertGenoFile(input_file, output_file) + try: + convertob.convert() + except EmptyConfigurations as why: + print(" No config info? Continuing...") + #excepted = True + continue + except Exception as why: + + print(" Exception:", why) + print(traceback.print_exc()) + print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, + convertob.latest_col_pos)) + print(" Column is:", convertob.latest_col_value) + print(" Row is:", convertob.latest_row_value) + break + + #def process_snps_file(cls, snps_file, new_directory): + # output_file = os.path.join(new_directory, "mouse_families.json") + # print("%s -> %s" % (snps_file, output_file)) + # convertob = ConvertGenoFile(input_file, output_file) + + + +if __name__=="__main__": + Old_Geno_Directory = """/home/zas1024/genotype_files/genotype/""" + New_Geno_Directory = """/home/zas1024/genotype_files/genotype/json/""" + #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno""" + #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps""" + #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json") + #convertob.convert() + ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory) + #ConvertGenoFiles(Geno_Directory) + + #process_csv(Input_File, Output_File)
\ No newline at end of file diff --git a/wqflask/maintenance/get_group_samplelists.py b/wqflask/maintenance/get_group_samplelists.py index 04e94886..1dc6c46c 100644 --- a/wqflask/maintenance/get_group_samplelists.py +++ b/wqflask/maintenance/get_group_samplelists.py @@ -6,16 +6,6 @@ import gzip from base import webqtlConfig -def process_genofiles(geno_dir=webqtlConfig.GENODIR): - print("Yabba") - #sys.exit("Dabba") - os.chdir(geno_dir) - for geno_file in glob.glob("*"): - if geno_file.lower().endswith(('.geno', '.geno.gz')): - #group_name = genofilename.split('.')[0] - sample_list = get_samplelist(geno_file) - - def get_samplelist(file_type, geno_file): if file_type == "geno": return get_samplelist_from_geno(geno_file) diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py new file mode 100644 index 00000000..41a3aad8 --- /dev/null +++ b/wqflask/maintenance/quantile_normalize.py @@ -0,0 +1,129 @@ +from __future__ import absolute_import, print_function, division + +import sys +sys.path.insert(0,'./') + +from itertools import izip + +import MySQLdb +import urlparse + +import numpy as np +import pandas as pd +from elasticsearch import Elasticsearch, TransportError +from elasticsearch.helpers import bulk + +from flask import Flask, g, request + +from wqflask import app +from utility.elasticsearch_tools import get_elasticsearch_connection +from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI + +def parse_db_uri(): + """Converts a database URI to the db name, host name, user name, and password""" + + parsed_uri = urlparse.urlparse(SQL_URI) + + db_conn_info = dict( + db = parsed_uri.path[1:], + host = parsed_uri.hostname, + user = parsed_uri.username, + passwd = parsed_uri.password) + + print(db_conn_info) + return db_conn_info + +def create_dataframe(input_file): + with open(input_file) as f: + ncols = len(f.readline().split("\t")) + + input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t", skiprows=1, usecols=range(1, ncols)) + return pd.DataFrame(input_array) + +#This function taken from https://github.com/ShawnLYU/Quantile_Normalize +def quantileNormalize(df_input): + df = df_input.copy() + #compute rank + dic = {} + for col in df: + dic.update({col : sorted(df[col])}) + sorted_df = pd.DataFrame(dic) + rank = sorted_df.mean(axis = 1).tolist() + #sort + for col in df: + t = np.searchsorted(np.sort(df[col]), df[col]) + df[col] = [rank[i] for i in t] + return df + +def set_data(dataset_name): + orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt" + + sample_list = [] + with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh: + for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)): + trait_dict = {} + sample_list = [] + if i == 0: + sample_names = line1.split('\t')[1:] + else: + trait_name = line1.split('\t')[0] + for i, sample in enumerate(sample_names): + this_sample = { + "name": sample, + "value": line1.split('\t')[i+1], + "qnorm": line2.split('\t')[i+1] + } + sample_list.append(this_sample) + query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName + FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet + WHERE Species.Id = InbredSet.SpeciesId and + InbredSet.Id = ProbeFreeze.InbredSetId and + ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId and + ProbeSetFreeze.Name = '%s' and + ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and + ProbeSetXRef.ProbeSetId = ProbeSet.Id and + ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0]) + Cursor.execute(query) + result_info = Cursor.fetchone() + + yield { + "_index": "traits", + "_type": "trait", + "_source": { + "name": trait_name, + "species": result_info[0], + "group": result_info[1], + "dataset": dataset_name, + "dataset_fullname": result_info[2], + "samples": sample_list, + "transform_types": "qnorm" + } + } + +if __name__ == '__main__': + Conn = MySQLdb.Connect(**parse_db_uri()) + Cursor = Conn.cursor() + + #es = Elasticsearch([{ + # "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT + #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None + + es = get_elasticsearch_connection(for_user=False) + + #input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt" + #input_df = create_dataframe(input_filename) + #output_df = quantileNormalize(input_df) + + #output_df.to_csv('quant_norm.csv', sep='\t') + + #out_filename = sys.argv[1][:-4] + '_quantnorm.txt' + + success, _ = bulk(es, set_data(sys.argv[1])) + + response = es.search( + index = "traits", doc_type = "trait", body = { + "query": { "match": { "name": "ENSMUSG00000028982" } } + } + ) + + print(response)
\ No newline at end of file |