Diffstat (limited to 'wqflask/maintenance')

 -rw-r--r--  wqflask/maintenance/convert_dryad_to_bimbam.py       |  70
 -rw-r--r--  wqflask/maintenance/convert_geno_to_bimbam.py        | 190
 -rw-r--r--  wqflask/maintenance/gen_select_dataset.py            |  99
 -rw-r--r--  wqflask/maintenance/generate_kinship_from_bimbam.py  |  61
 -rw-r--r--  wqflask/maintenance/geno_to_json.py                  | 197
 -rw-r--r--  wqflask/maintenance/get_group_samplelists.py         |  12
 -rw-r--r--  wqflask/maintenance/quantile_normalize.py            | 129

7 files changed, 703 insertions(+), 55 deletions(-)
diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py
new file mode 100644
index 00000000..e833b395
--- /dev/null
+++ b/wqflask/maintenance/convert_dryad_to_bimbam.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+
+"""
+Convert Data Dryad files to a BIMBAM _geno and _snps file
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+
+
+def read_dryad_file(filename):
+    exclude_count = 0
+    marker_list = []
+    sample_dict = {}
+    sample_list = []
+    geno_rows = []
+    with open(filename, 'r') as the_file:
+        for i, line in enumerate(the_file):
+            if i > 0:
+                # Column 2 flags excluded animals; keep only the "no" rows
+                if line.split(" ")[1] == "no":
+                    sample_name = line.split(" ")[0]
+                    sample_list.append(sample_name)
+                    sample_dict[sample_name] = line.split(" ")[2:]
+                else:
+                    exclude_count += 1
+            else:
+                marker_list = line.split(" ")[2:]
+
+    for i, marker in enumerate(marker_list):
+        this_row = []
+        this_row.append(marker)
+        this_row.append("X")
+        this_row.append("Y")
+        for sample in sample_list:
+            this_row.append(sample_dict[sample][i])
+        geno_rows.append(this_row)
+
+    print("Excluded samples:", exclude_count)
+
+    return geno_rows
+
+    # Earlier row-at-a-time implementation, kept for reference:
+    #for i, marker in enumerate(marker_list):
+    #    this_row = []
+    #    this_row.append(marker)
+    #    this_row.append("X")
+    #    this_row.append("Y")
+    #    with open(filename, 'r') as the_file:
+    #        for j, line in enumerate(the_file):
+    #            if j > 0:
+    #                this_row.append(line.split(" ")[i+2])
+    #    print("row: " + str(i))
+    #    geno_rows.append(this_row)
+    #
+    #return geno_rows
+
+
+def write_bimbam_files(geno_rows):
+    # NOTE: despite the module docstring, only the _geno file is written here
+    with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh:
+        for row in geno_rows:
+            geno_fh.write(", ".join(row) + "\n")
+
+
+def convert_dryad_to_bimbam(filename):
+    geno_file_rows = read_dryad_file(filename)
+    write_bimbam_files(geno_file_rows)
+
+
+if __name__ == "__main__":
+    input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+    convert_dryad_to_bimbam(input_filename)
\ No newline at end of file
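
For reference, the BIMBAM mean genotype rows written by write_bimbam_files() above are comma-separated: marker name, two placeholder allele columns (hard-coded as X and Y), then one value per retained sample. An illustrative two-sample fragment (marker names and values are made up):

    rs3694790, X, Y, 1, 0.5
    rs3658242, X, Y, 0, NA
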
diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py
new file mode 100644
index 00000000..528b98cf
--- /dev/null
+++ b/wqflask/maintenance/convert_geno_to_bimbam.py
@@ -0,0 +1,190 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to BIMBAM files
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to the BIMBAM geno/pheno/snps files that are used when
+running GEMMA
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+
+
+class EmptyConfigurations(Exception):
+    pass
+
+
+class Marker(object):
+    def __init__(self):
+        self.name = None
+        self.chr = None
+        self.cM = None
+        self.Mb = None
+        self.genotypes = []
+
+
+class ConvertGenoFile(object):
+
+    def __init__(self, input_file, output_files):
+        self.input_file = input_file
+        self.output_files = output_files
+
+        self.mb_exists = False
+        self.cm_exists = False
+        self.markers = []
+
+        self.latest_row_pos = None
+        self.latest_col_pos = None
+        self.latest_row_value = None
+        self.latest_col_value = None
+
+    def convert(self):
+        # Map .geno haplotype codes to BIMBAM genotype values
+        self.haplotype_notation = {
+            '@mat': "1",
+            '@pat': "0",
+            '@het': "0.5",
+            '@unk': "NA"
+        }
+
+        self.configurations = {}
+        self.input_fh = open(self.input_file)
+
+        self.process_csv()
+
+    def process_csv(self):
+        for row in self.process_rows():
+            row_items = row.split("\t")
+
+            this_marker = Marker()
+            this_marker.name = row_items[1]
+            this_marker.chr = row_items[0]
+            if self.cm_exists and self.mb_exists:
+                this_marker.cM = row_items[2]
+                this_marker.Mb = row_items[3]
+                genotypes = row_items[4:]
+            elif self.cm_exists:
+                this_marker.cM = row_items[2]
+                genotypes = row_items[3:]
+            elif self.mb_exists:
+                this_marker.Mb = row_items[2]
+                genotypes = row_items[3:]
+            else:
+                genotypes = row_items[2:]
+            for genotype in genotypes:
+                if genotype.upper().strip() in self.configurations:
+                    this_marker.genotypes.append(self.configurations[genotype.upper().strip()])
+                else:
+                    this_marker.genotypes.append("NA")
+
+            self.markers.append(this_marker.__dict__)
+
+        self.write_to_bimbam()
+
+    def write_to_bimbam(self):
+        with open(self.output_files[0], "w") as geno_fh:
+            for marker in self.markers:
+                geno_fh.write(marker['name'])
+                geno_fh.write(", X, Y")
+                geno_fh.write(", " + ", ".join(marker['genotypes']))
+                geno_fh.write("\n")
+
+        # Dummy phenotype file: one placeholder value per sample
+        with open(self.output_files[1], "w") as pheno_fh:
+            for sample in self.sample_list:
+                pheno_fh.write("1\n")
+
+        # SNP annotation file: name, position in base pairs, chromosome
+        with open(self.output_files[2], "w") as snp_fh:
+            for marker in self.markers:
+                if self.mb_exists:
+                    snp_fh.write(marker['name'] + ", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n")
+                else:
+                    snp_fh.write(marker['name'] + ", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n")
+
+    def get_sample_list(self, row_contents):
+        self.sample_list = []
+        if self.mb_exists:
+            if self.cm_exists:
+                self.sample_list = row_contents[4:]
+            else:
+                self.sample_list = row_contents[3:]
+        else:
+            if self.cm_exists:
+                self.sample_list = row_contents[3:]
+            else:
+                self.sample_list = row_contents[2:]
+
+    def process_rows(self):
+        for self.latest_row_pos, row in enumerate(self.input_fh):
+            self.latest_row_value = row
+            # Take care of headers
+            if not row.strip():
+                continue
+            if row.startswith('#'):
+                continue
+            if row.startswith('Chr'):
+                if 'Mb' in row.split():
+                    self.mb_exists = True
+                if 'cM' in row.split():
+                    self.cm_exists = True
+                self.get_sample_list(row.split())
+                continue
+            if row.startswith('@'):
+                key, _separator, value = row.partition(':')
+                key = key.strip()
+                value = value.strip()
+                if key == "@filler":
+                    raise EmptyConfigurations
+                if key in self.haplotype_notation:
+                    self.configurations[value] = self.haplotype_notation[key]
+                continue
+            if not len(self.configurations):
+                raise EmptyConfigurations
+            yield row
+
+    @classmethod
+    def process_all(cls, old_directory, new_directory):
+        os.chdir(old_directory)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('.geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            if group_name == "HSNIH-Palmer":
+                continue
+            geno_output_file = os.path.join(new_directory, group_name + "_geno.txt")
+            pheno_output_file = os.path.join(new_directory, group_name + "_pheno.txt")
+            snp_output_file = os.path.join(new_directory, group_name + "_snps.txt")
+            output_files = [geno_output_file, pheno_output_file, snp_output_file]
+            print("%s -> %s" % (
+                os.path.join(old_directory, input_file), geno_output_file))
+            convertob = ConvertGenoFile(input_file, output_files)
+            try:
+                convertob.convert()
+            except EmptyConfigurations:
+                print("  No config info? Continuing...")
+                continue
+            except Exception as why:
+                print("  Exception:", why)
+                traceback.print_exc()
+                print("  Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+                                                                  convertob.latest_col_pos))
+                print("  Column is:", convertob.latest_col_value)
+                print("  Row is:", convertob.latest_row_value)
+                break
+
+
+if __name__ == "__main__":
+    Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
+    New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/bimbam"""
+    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
\ No newline at end of file
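
ConvertGenoFile.process_rows() above expects the GeneNetwork .geno layout: # comment lines, @key:value haplotype declarations that feed haplotype_notation, a Chr header row that names the samples, then one tab-separated row per marker. A hypothetical fragment (sample names and values invented; columns are tab-separated):

    #comment lines are skipped
    @mat:B
    @pat:D
    @het:H
    @unk:U
    Chr	Locus	cM	Mb	BXD1	BXD2
    1	rs31443144	1.50	3.010	B	D

With these declarations, B becomes 1, D becomes 0, H becomes 0.5, and U becomes NA in the _geno.txt output.
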
diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index e5726656..647e58a2 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -57,16 +57,13 @@ import urlparse
 
 from pprint import pformat as pf
 
-#Engine = sa.create_engine(zach_settings.SQLALCHEMY_DATABASE_URI)
+#Engine = sa.create_engine(zach_settings.SQL_URI)
 
 # build MySql database connection
 
 #conn = Engine.connect()
 
-print('WARNING: This conversion is now OBSOLETE as the menu gets built from the database in Javascript using GN_SERVER instead!')
-
-
-def parse_db_uri(db_uri):
+def parse_db_uri():
     """Converts a database URI to the db name, host name, user name, and password"""
 
     parsed_uri = urlparse.urlparse(SQL_URI)
@@ -81,10 +78,10 @@ def parse_db_uri(db_uri):
 
     return db_conn_info
 
-
 def get_species():
     """Build species list"""
-    Cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
+    #Cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
+    Cursor.execute("select Name, MenuName from Species order by OrderId")
     species = list(Cursor.fetchall())
     return species
 
@@ -96,13 +93,14 @@ def get_groups(species):
     groups = {}
     for species_name, _species_full_name in species:
         Cursor.execute("""select InbredSet.Name, InbredSet.FullName from InbredSet,
                           Species, ProbeFreeze, GenoFreeze, PublishFreeze where Species.Name = '%s'
-                          and InbredSet.SpeciesId = Species.Id and InbredSet.Name != 'BXD300' and
+                          and InbredSet.SpeciesId = Species.Id and
                           (PublishFreeze.InbredSetId = InbredSet.Id
                            or GenoFreeze.InbredSetId = InbredSet.Id
                            or ProbeFreeze.InbredSetId = InbredSet.Id)
                           group by InbredSet.Name
-                          order by InbredSet.Name""" % species_name)
-        groups[species_name] = list(Cursor.fetchall())
+                          order by InbredSet.FullName""" % species_name)
+        results = Cursor.fetchall()
+        groups[species_name] = list(results)
 
     return groups
 
@@ -123,10 +121,20 @@ def get_types(groups):
             else:
                 types[species][group_name] = [("Genotypes", "Genotypes")]
             if group_name in types[species]:
-                types[species][group_name] += build_types(species, group_name)
-            else:
-                types[species][group_name] = build_types(species, group_name)
-
+                types_list = build_types(species, group_name)
+                if len(types_list) > 0:
+                    types[species][group_name] += types_list
+                else:
+                    if not phenotypes_exist(group_name) and not genotypes_exist(group_name):
+                        types[species].pop(group_name, None)
+                        groups[species] = tuple(group for group in groups[species] if group[0] != group_name)
+            else: #ZS: This whole else statement might be unnecessary, need to check
+                types_list = build_types(species, group_name)
+                if len(types_list) > 0:
+                    types[species][group_name] = types_list
+                else:
+                    types[species].pop(group_name, None)
+                    groups[species] = tuple(group for group in groups[species] if group[0] != group_name)
 
     return types
 
@@ -190,7 +198,6 @@ def get_datasets(types):
     for species, group_dict in types.iteritems():
         datasets[species] = {}
         for group, type_list in group_dict.iteritems():
-            #print("type_list: ", type_list)
            datasets[species][group] = {}
             for type_name in type_list:
                 these_datasets = build_datasets(species, group, type_name[0])
@@ -203,26 +210,31 @@ def get_datasets(types):
 def build_datasets(species, group, type_name):
     """Gets dataset names from database"""
     dataset_text = dataset_value = None
+    datasets = []
     if type_name == "Phenotypes":
-        print("GROUP:", group)
-        Cursor.execute("""select InfoFiles.GN_AccesionId from InfoFiles, PublishFreeze, InbredSet where
+        Cursor.execute("""select InfoFiles.GN_AccesionId, PublishFreeze.Name, PublishFreeze.FullName from InfoFiles, PublishFreeze, InbredSet where
                           InbredSet.Name = '%s' and
                           PublishFreeze.InbredSetId = InbredSet.Id and
-                          InfoFiles.InfoPageName = PublishFreeze.Name and
-                          PublishFreeze.public > 0 and
-                          PublishFreeze.confidentiality < 1 order by
-                          PublishFreeze.CreateTime desc""" % group)
-
-        results = Cursor.fetchone()
-        if results != None:
-            dataset_id = str(results[0])
+                          InfoFiles.InfoPageName = PublishFreeze.Name order by
+                          PublishFreeze.CreateTime asc""" % group)
+
+        results = Cursor.fetchall()
+        if len(results) > 0:
+            for result in results:
+                print(result)
+                dataset_id = str(result[0])
+                dataset_value = str(result[1])
+                if group == 'MDP':
+                    dataset_text = "Mouse Phenome Database"
+                else:
+                    #dataset_text = "%s Phenotypes" % group
+                    dataset_text = str(result[2])
+                datasets.append((dataset_id, dataset_value, dataset_text))
         else:
             dataset_id = "None"
-        dataset_value = "%sPublish" % group
-        if group == 'MDP':
-            dataset_text = "Mouse Phenome Database"
-        else:
-            dataset_text = "%s Published Phenotypes" % group
+            dataset_value = "%sPublish" % group
+            dataset_text = "%s Phenotypes" % group
+            datasets.append((dataset_id, dataset_value, dataset_text))
 
     elif type_name == "Genotypes":
         Cursor.execute("""select InfoFiles.GN_AccesionId from InfoFiles, GenoFreeze, InbredSet where
@@ -240,10 +252,9 @@ def build_datasets(species, group, type_name):
             dataset_id = "None"
         dataset_value = "%sGeno" % group
         dataset_text = "%s Genotypes" % group
+        datasets.append((dataset_id, dataset_value, dataset_text))
 
-    if dataset_value:
-        return [(dataset_id, dataset_value, dataset_text)]
-    else:
+    else:  # for mRNA expression/ProbeSet
         Cursor.execute("""select ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName from
                           ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue, Species where
                           Species.Name = '%s' and Species.Id = InbredSet.SpeciesId and
@@ -261,26 +272,26 @@ def build_datasets(species, group, type_name):
                 this_dataset_info.append(str(info))
             datasets.append(this_dataset_info)
 
-        return datasets
+    return datasets
 
 def main():
     """Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
 
-    parse_db_uri(SQL_URI)
+    parse_db_uri()
 
     species = get_species()
     groups = get_groups(species)
     types = get_types(groups)
     datasets = get_datasets(types)
 
-    species.append(('All Species', 'All Species'))
-    groups['All Species'] = [('All Groups', 'All Groups')]
-    types['All Species'] = {}
-    types['All Species']['All Groups'] = [('Phenotypes', 'Phenotypes')]
-    datasets['All Species'] = {}
-    datasets['All Species']['All Groups'] = {}
-    datasets['All Species']['All Groups']['Phenotypes'] = [('All Phenotypes','All Phenotypes')]
+    #species.append(('All Species', 'All Species'))
+    #groups['All Species'] = [('All Groups', 'All Groups')]
+    #types['All Species'] = {}
+    #types['All Species']['All Groups'] = [('Phenotypes', 'Phenotypes')]
+    #datasets['All Species'] = {}
+    #datasets['All Species']['All Groups'] = {}
+    #datasets['All Species']['All Groups']['Phenotypes'] = [('All Phenotypes','All Phenotypes')]
 
     data = dict(species=species,
                 groups=groups,
@@ -306,6 +317,6 @@ def _test_it():
     #print("build_datasets:", pf(datasets))
 
 if __name__ == '__main__':
-    Conn = MySQLdb.Connect(**parse_db_uri(SQL_URI))
+    Conn = MySQLdb.Connect(**parse_db_uri())
     Cursor = Conn.cursor()
-    main()
+    main()
\ No newline at end of file
diff --git a/wqflask/maintenance/generate_kinship_from_bimbam.py b/wqflask/maintenance/generate_kinship_from_bimbam.py
new file mode 100644
index 00000000..b53f5dda
--- /dev/null
+++ b/wqflask/maintenance/generate_kinship_from_bimbam.py
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+
+"""
+Generate relatedness matrix files for GEMMA from BIMBAM genotype/phenotype files
+
+This file goes through all of the BIMBAM files in the bimbam directory
+and uses GEMMA to generate their corresponding kinship/relatedness matrix file
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+
+
+class GenerateKinshipMatrices(object):
+    def __init__(self, group_name, geno_file, pheno_file):
+        self.group_name = group_name
+        self.geno_file = geno_file
+        self.pheno_file = pheno_file
+
+    def generate_kinship(self):
+        gemma_command = ("/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " +
+                         self.geno_file + " -p " + self.pheno_file +
+                         " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " +
+                         self.group_name)
+        print("command:", gemma_command)
+        os.system(gemma_command)
+
+    @classmethod
+    def process_all(cls, geno_dir, bimbam_dir):
+        os.chdir(geno_dir)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('.geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            if group_name == "HSNIH-Palmer":
+                continue
+            geno_input_file = os.path.join(bimbam_dir, group_name + "_geno.txt")
+            pheno_input_file = os.path.join(bimbam_dir, group_name + "_pheno.txt")
+            convertob = GenerateKinshipMatrices(group_name, geno_input_file, pheno_input_file)
+            try:
+                convertob.generate_kinship()
+            except Exception as why:
+                print("  Exception:", why)
+                traceback.print_exc()
+                break
+
+
+if __name__ == "__main__":
+    Geno_Directory = """/export/local/home/zas1024/genotype_files/genotype/"""
+    Bimbam_Directory = """/export/local/home/zas1024/genotype_files/genotype/bimbam/"""
+    GenerateKinshipMatrices.process_all(Geno_Directory, Bimbam_Directory)
+
+    #./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD
\ No newline at end of file
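
generate_kinship() shells out via os.system() with string concatenation. A subprocess-based sketch of the same call is below; GEMMA_PATH and OUT_DIR are stand-ins for the hard-coded Guix store path and output directory used above. With -gk 1, GEMMA computes the centered relatedness matrix and writes it to <outdir>/<group_name>.cXX.txt.

    # Sketch, assuming GEMMA_PATH and OUT_DIR point at the paths used above.
    import subprocess
    subprocess.check_call([
        GEMMA_PATH, "-g", self.geno_file, "-p", self.pheno_file,
        "-gk", "1", "-outdir", OUT_DIR, "-o", self.group_name])
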
diff --git a/wqflask/maintenance/geno_to_json.py b/wqflask/maintenance/geno_to_json.py
new file mode 100644
index 00000000..9579812a
--- /dev/null
+++ b/wqflask/maintenance/geno_to_json.py
@@ -0,0 +1,197 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to json
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to json files that are used when running the marker regression
+code
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+
+import simplejson as json
+
+
+class EmptyConfigurations(Exception):
+    pass
+
+
+class Marker(object):
+    def __init__(self):
+        self.name = None
+        self.chr = None
+        self.cM = None
+        self.Mb = None
+        self.genotypes = []
+
+
+class ConvertGenoFile(object):
+
+    def __init__(self, input_file, output_file):
+        self.input_file = input_file
+        self.output_file = output_file
+
+        self.mb_exists = False
+        self.cm_exists = False
+        self.markers = []
+
+        self.latest_row_pos = None
+        self.latest_col_pos = None
+        self.latest_row_value = None
+        self.latest_col_value = None
+
+    def convert(self):
+        # Map .geno haplotype codes to numeric genotype values
+        self.haplotype_notation = {
+            '@mat': "1",
+            '@pat': "0",
+            '@het': "0.5",
+            '@unk': "NA"
+        }
+
+        self.configurations = {}
+        self.input_fh = open(self.input_file)
+
+        # process_csv() opens and writes the output file itself
+        self.process_csv()
+
+    def process_csv(self):
+        for row in self.process_rows():
+            row_items = row.split("\t")
+
+            this_marker = Marker()
+            this_marker.name = row_items[1]
+            this_marker.chr = row_items[0]
+            if self.cm_exists and self.mb_exists:
+                this_marker.cM = row_items[2]
+                this_marker.Mb = row_items[3]
+                genotypes = row_items[4:]
+            elif self.cm_exists:
+                this_marker.cM = row_items[2]
+                genotypes = row_items[3:]
+            elif self.mb_exists:
+                this_marker.Mb = row_items[2]
+                genotypes = row_items[3:]
+            else:
+                genotypes = row_items[2:]
+            for genotype in genotypes:
+                if genotype.upper() in self.configurations:
+                    this_marker.genotypes.append(self.configurations[genotype.upper()])
+                else:
+                    this_marker.genotypes.append("NA")
+
+            self.markers.append(this_marker.__dict__)
+
+        with open(self.output_file, 'w') as fh:
+            json.dump(self.markers, fh, indent=" ", sort_keys=True)
+
+    def process_rows(self):
+        for self.latest_row_pos, row in enumerate(self.input_fh):
+            self.latest_row_value = row
+            # Take care of headers
+            if not row.strip():
+                continue
+            if row.startswith('#'):
+                continue
+            if row.startswith('Chr'):
+                if 'Mb' in row.split():
+                    self.mb_exists = True
+                if 'cM' in row.split():
+                    self.cm_exists = True
+                continue
+            if row.startswith('@'):
+                key, _separator, value = row.partition(':')
+                key = key.strip()
+                value = value.strip()
+                if key in self.haplotype_notation:
+                    self.configurations[value] = self.haplotype_notation[key]
+                continue
+            if not len(self.configurations):
+                raise EmptyConfigurations
+            yield row
+
+    @classmethod
+    def process_all(cls, old_directory, new_directory):
+        os.chdir(old_directory)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('.geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            output_file = os.path.join(new_directory, group_name + ".json")
+            print("%s -> %s" % (
+                os.path.join(old_directory, input_file), output_file))
+            convertob = ConvertGenoFile(input_file, output_file)
+            try:
+                convertob.convert()
+            except EmptyConfigurations:
+                print("  No config info? Continuing...")
+                continue
+            except Exception as why:
+                print("  Exception:", why)
+                traceback.print_exc()
+                print("  Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+                                                                  convertob.latest_col_pos))
+                print("  Column is:", convertob.latest_col_value)
+                print("  Row is:", convertob.latest_row_value)
+                break
+
+
+if __name__ == "__main__":
+    Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
+    New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/json"""
+    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
\ No newline at end of file
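
Since each element of self.markers is a Marker.__dict__, the JSON written by json.dump(..., sort_keys=True) is a list of objects holding the five Marker fields in alphabetical key order. An illustrative single-marker, two-sample output (values invented):

    [
     {
      "Mb": "3.010",
      "cM": "1.50",
      "chr": "1",
      "genotypes": ["1", "0"],
      "name": "rs31443144"
     }
    ]
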
diff --git a/wqflask/maintenance/get_group_samplelists.py b/wqflask/maintenance/get_group_samplelists.py
index 04e94886..fb22898a 100644
--- a/wqflask/maintenance/get_group_samplelists.py
+++ b/wqflask/maintenance/get_group_samplelists.py
@@ -6,16 +6,6 @@ import gzip
 
 from base import webqtlConfig
 
-def process_genofiles(geno_dir=webqtlConfig.GENODIR):
-    print("Yabba")
-    #sys.exit("Dabba")
-    os.chdir(geno_dir)
-    for geno_file in glob.glob("*"):
-        if geno_file.lower().endswith(('.geno', '.geno.gz')):
-            #group_name = genofilename.split('.')[0]
-            sample_list = get_samplelist(geno_file)
-
-
 def get_samplelist(file_type, geno_file):
     if file_type == "geno":
         return get_samplelist_from_geno(geno_file)
@@ -37,7 +27,7 @@ def get_samplelist_from_geno(genofilename):
             continue
         break
 
-    headers = line.split()
+    headers = line.split("\t")
     if headers[3] == "Mb":
         samplelist = headers[4:]
     else:
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
new file mode 100644
index 00000000..41a3aad8
--- /dev/null
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -0,0 +1,129 @@
+from __future__ import absolute_import, print_function, division
+
+import sys
+sys.path.insert(0, './')
+
+from itertools import izip
+
+import MySQLdb
+import urlparse
+
+import numpy as np
+import pandas as pd
+from elasticsearch import Elasticsearch, TransportError
+from elasticsearch.helpers import bulk
+
+from flask import Flask, g, request
+
+from wqflask import app
+from utility.elasticsearch_tools import get_elasticsearch_connection
+from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI
+
+
+def parse_db_uri():
+    """Converts a database URI to the db name, host name, user name, and password"""
+
+    parsed_uri = urlparse.urlparse(SQL_URI)
+
+    db_conn_info = dict(
+        db=parsed_uri.path[1:],
+        host=parsed_uri.hostname,
+        user=parsed_uri.username,
+        passwd=parsed_uri.password)
+
+    print(db_conn_info)
+    return db_conn_info
+
+
+def create_dataframe(input_file):
+    with open(input_file) as f:
+        ncols = len(f.readline().split("\t"))
+
+    input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t",
+                             skiprows=1, usecols=range(1, ncols))
+    return pd.DataFrame(input_array)
+
+
+#This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+def quantileNormalize(df_input):
+    df = df_input.copy()
+    #compute rank
+    dic = {}
+    for col in df:
+        dic.update({col: sorted(df[col])})
+    sorted_df = pd.DataFrame(dic)
+    rank = sorted_df.mean(axis=1).tolist()
+    #sort
+    for col in df:
+        t = np.searchsorted(np.sort(df[col]), df[col])
+        df[col] = [rank[i] for i in t]
+    return df
+
+
+def set_data(dataset_name):
+    orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
+
+    sample_list = []
+    with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh:
+        for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)):
+            trait_dict = {}
+            sample_list = []
+            if i == 0:
+                sample_names = line1.split('\t')[1:]
+            else:
+                trait_name = line1.split('\t')[0]
+                for j, sample in enumerate(sample_names):
+                    this_sample = {
+                        "name": sample,
+                        "value": line1.split('\t')[j+1],
+                        "qnorm": line2.split('\t')[j+1]
+                    }
+                    sample_list.append(this_sample)
+                query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName
+                           FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet
+                           WHERE Species.Id = InbredSet.SpeciesId and
+                                 InbredSet.Id = ProbeFreeze.InbredSetId and
+                                 ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId and
+                                 ProbeSetFreeze.Name = '%s' and
+                                 ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
+                                 ProbeSetXRef.ProbeSetId = ProbeSet.Id and
+                                 ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0])
+                Cursor.execute(query)
+                result_info = Cursor.fetchone()
+
+                yield {
+                    "_index": "traits",
+                    "_type": "trait",
+                    "_source": {
+                        "name": trait_name,
+                        "species": result_info[0],
+                        "group": result_info[1],
+                        "dataset": dataset_name,
+                        "dataset_fullname": result_info[2],
+                        "samples": sample_list,
+                        "transform_types": "qnorm"
+                    }
+                }
+
+
+if __name__ == '__main__':
+    Conn = MySQLdb.Connect(**parse_db_uri())
+    Cursor = Conn.cursor()
+
+    #es = Elasticsearch([{
+    #    "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT
+    #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None
+
+    es = get_elasticsearch_connection(for_user=False)
+
+    #input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+    #input_df = create_dataframe(input_filename)
+    #output_df = quantileNormalize(input_df)
+
+    #output_df.to_csv('quant_norm.csv', sep='\t')
+
+    success, _ = bulk(es, set_data(sys.argv[1]))
+
+    response = es.search(
+        index="traits", doc_type="trait", body={
+            "query": {"match": {"name": "ENSMUSG00000028982"}}
+        }
+    )
+
+    print(response)
\ No newline at end of file
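
To make quantileNormalize() above concrete: it sorts each column, averages the sorted columns position by position, and then replaces every value with the mean at its rank, ties taking the rank found by np.searchsorted. A tiny worked example with made-up numbers:

    # Columns A=[5, 2, 3] and B=[4, 1, 4]; sorted they are [2, 3, 5] and [1, 4, 4],
    # so the row means of the sorted columns are [1.5, 3.5, 4.5].
    # A: 5 -> rank 2 -> 4.5,  2 -> rank 0 -> 1.5,  3 -> rank 1 -> 3.5
    # B: both 4s -> rank 1 -> 3.5,  1 -> rank 0 -> 1.5
    import pandas as pd
    df = pd.DataFrame({"A": [5, 2, 3], "B": [4, 1, 4]})
    print(quantileNormalize(df))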