diff options
author | zsloan | 2021-10-18 17:50:26 +0000 |
---|---|---|
committer | zsloan | 2021-10-18 17:50:26 +0000 |
commit | e36eaf0003a598bc5aa688803dd1b36c24a4c051 (patch) | |
tree | a59b7dadf02241575eb0774f97c6048e2425c053 /wqflask/maintenance | |
parent | bd421438f1f0b4de913fa40cd49cfcda27e6b16f (diff) | |
parent | 04f3d13aceeaec2e52b94037d59f08ed6dc6a8bb (diff) | |
download | genenetwork2-e36eaf0003a598bc5aa688803dd1b36c24a4c051.tar.gz |
Merge branch 'testing' of github.com:genenetwork/genenetwork2 into feature/remove_trait_creation_from_search
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r-- | wqflask/maintenance/convert_dryad_to_bimbam.py | 11 | ||||
-rw-r--r-- | wqflask/maintenance/convert_geno_to_bimbam.py | 46 | ||||
-rw-r--r-- | wqflask/maintenance/gen_select_dataset.py | 27 | ||||
-rw-r--r-- | wqflask/maintenance/generate_kinship_from_bimbam.py | 30 | ||||
-rw-r--r-- | wqflask/maintenance/generate_probesetfreeze_file.py | 14 | ||||
-rw-r--r-- | wqflask/maintenance/geno_to_json.py | 82 | ||||
-rw-r--r-- | wqflask/maintenance/get_group_samplelists.py | 3 | ||||
-rw-r--r-- | wqflask/maintenance/print_benchmark.py | 6 | ||||
-rw-r--r-- | wqflask/maintenance/quantile_normalize.py | 43 | ||||
-rw-r--r-- | wqflask/maintenance/set_resource_defaults.py | 53 |
10 files changed, 186 insertions, 129 deletions
diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py index 12ce35e9..18fbb8a1 100644 --- a/wqflask/maintenance/convert_dryad_to_bimbam.py +++ b/wqflask/maintenance/convert_dryad_to_bimbam.py @@ -41,7 +41,7 @@ def read_dryad_file(filename): return geno_rows - #for i, marker in enumerate(marker_list): + # for i, marker in enumerate(marker_list): # this_row = [] # this_row.append(marker) # this_row.append("X") @@ -52,18 +52,21 @@ def read_dryad_file(filename): # this_row.append(line.split(" ")[i+2]) # print("row: " + str(i)) # geno_rows.append(this_row) - # - #return geno_rows + # + # return geno_rows + def write_bimbam_files(geno_rows): with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh: for row in geno_rows: geno_fh.write(", ".join(row) + "\n") + def convert_dryad_to_bimbam(filename): geno_file_rows = read_dryad_file(filename) write_bimbam_files(geno_file_rows) -if __name__=="__main__": + +if __name__ == "__main__": input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt" convert_dryad_to_bimbam(input_filename) diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py index d49742f2..078be529 100644 --- a/wqflask/maintenance/convert_geno_to_bimbam.py +++ b/wqflask/maintenance/convert_geno_to_bimbam.py @@ -20,9 +20,12 @@ import simplejson as json from pprint import pformat as pf -class EmptyConfigurations(Exception): pass -class Marker(object): +class EmptyConfigurations(Exception): + pass + + +class Marker: def __init__(self): self.name = None self.chr = None @@ -30,7 +33,8 @@ class Marker(object): self.Mb = None self.genotypes = [] -class ConvertGenoFile(object): + +class ConvertGenoFile: def __init__(self, input_file, output_files): self.input_file = input_file @@ -52,7 +56,7 @@ class ConvertGenoFile(object): '@pat': "0", '@het': "0.5", '@unk': "NA" - } + } self.configurations = {} self.input_fh = open(self.input_file) @@ -80,13 +84,14 @@ class ConvertGenoFile(object): genotypes = row_items[2:] for item_count, genotype in enumerate(genotypes): if genotype.upper().strip() in self.configurations: - this_marker.genotypes.append(self.configurations[genotype.upper().strip()]) + this_marker.genotypes.append( + self.configurations[genotype.upper().strip()]) else: this_marker.genotypes.append("NA") self.markers.append(this_marker.__dict__) - self.write_to_bimbam() + self.write_to_bimbam() def write_to_bimbam(self): with open(self.output_files[0], "w") as geno_fh: @@ -103,9 +108,11 @@ class ConvertGenoFile(object): with open(self.output_files[2], "w") as snp_fh: for marker in self.markers: if self.mb_exists: - snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n") + snp_fh.write( + marker['name'] + ", " + str(int(float(marker['Mb']) * 1000000)) + ", " + marker['chr'] + "\n") else: - snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n") + snp_fh.write( + marker['name'] + ", " + str(int(float(marker['cM']) * 1000000)) + ", " + marker['chr'] + "\n") def get_sample_list(self, row_contents): self.sample_list = [] @@ -119,7 +126,7 @@ class ConvertGenoFile(object): self.sample_list = row_contents[3:] else: self.sample_list = row_contents[2:] - + def process_rows(self): for self.latest_row_pos, row in enumerate(self.input_fh): self.latest_row_value = row @@ -157,10 +164,14 @@ class ConvertGenoFile(object): group_name = ".".join(input_file.split('.')[:-1]) if group_name == "HSNIH-Palmer": continue - geno_output_file = os.path.join(new_directory, group_name + "_geno.txt") - pheno_output_file = os.path.join(new_directory, group_name + "_pheno.txt") - snp_output_file = os.path.join(new_directory, group_name + "_snps.txt") - output_files = [geno_output_file, pheno_output_file, snp_output_file] + geno_output_file = os.path.join( + new_directory, group_name + "_geno.txt") + pheno_output_file = os.path.join( + new_directory, group_name + "_pheno.txt") + snp_output_file = os.path.join( + new_directory, group_name + "_snps.txt") + output_files = [geno_output_file, + pheno_output_file, snp_output_file] print("%s -> %s" % ( os.path.join(old_directory, input_file), geno_output_file)) convertob = ConvertGenoFile(input_file, output_files) @@ -173,17 +184,18 @@ class ConvertGenoFile(object): print(" Exception:", why) print(traceback.print_exc()) print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, - convertob.latest_col_pos)) + convertob.latest_col_pos)) print(" Column is:", convertob.latest_col_value) print(" Row is:", convertob.latest_row_value) break -if __name__=="__main__": + +if __name__ == "__main__": Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype""" New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/bimbam""" #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno""" #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps""" #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json") - #convertob.convert() + # convertob.convert() ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory) - #ConvertGenoFiles(Geno_Directory) + # ConvertGenoFiles(Geno_Directory) diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py index 544e2fd1..db65a11f 100644 --- a/wqflask/maintenance/gen_select_dataset.py +++ b/wqflask/maintenance/gen_select_dataset.py @@ -55,16 +55,17 @@ from pprint import pformat as pf #conn = Engine.connect() + def parse_db_uri(): """Converts a database URI to the db name, host name, user name, and password""" parsed_uri = urllib.parse.urlparse(SQL_URI) db_conn_info = dict( - db = parsed_uri.path[1:], - host = parsed_uri.hostname, - user = parsed_uri.username, - passwd = parsed_uri.password) + db=parsed_uri.path[1:], + host=parsed_uri.hostname, + user=parsed_uri.username, + passwd=parsed_uri.password) print(db_conn_info) return db_conn_info @@ -119,21 +120,23 @@ def get_types(groups): else: if not phenotypes_exist(group_name) and not genotypes_exist(group_name): types[species].pop(group_name, None) - groups[species] = tuple(group for group in groups[species] if group[0] != group_name) - else: #ZS: This whole else statement might be unnecessary, need to check + groups[species] = tuple( + group for group in groups[species] if group[0] != group_name) + else: # ZS: This whole else statement might be unnecessary, need to check types_list = build_types(species, group_name) if len(types_list) > 0: types[species][group_name] = types_list else: types[species].pop(group_name, None) - groups[species] = tuple(group for group in groups[species] if group[0] != group_name) + groups[species] = tuple( + group for group in groups[species] if group[0] != group_name) return types def phenotypes_exist(group_name): #print("group_name:", group_name) Cursor.execute("""select Name from PublishFreeze - where PublishFreeze.Name = '%s'""" % (group_name+"Publish")) + where PublishFreeze.Name = '%s'""" % (group_name + "Publish")) results = Cursor.fetchone() #print("RESULTS:", results) @@ -143,10 +146,11 @@ def phenotypes_exist(group_name): else: return False + def genotypes_exist(group_name): #print("group_name:", group_name) Cursor.execute("""select Name from GenoFreeze - where GenoFreeze.Name = '%s'""" % (group_name+"Geno")) + where GenoFreeze.Name = '%s'""" % (group_name + "Geno")) results = Cursor.fetchone() #print("RESULTS:", results) @@ -156,6 +160,7 @@ def genotypes_exist(group_name): else: return False + def build_types(species, group): """Fetches tissues @@ -184,6 +189,7 @@ def build_types(species, group): return results + def get_datasets(types): """Build datasets list""" datasets = {} @@ -246,7 +252,7 @@ def build_datasets(species, group, type_name): dataset_text = "%s Genotypes" % group datasets.append((dataset_id, dataset_value, dataset_text)) - else: # for mRNA expression/ProbeSet + else: # for mRNA expression/ProbeSet Cursor.execute("""select ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName from ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue, Species where Species.Name = '%s' and Species.Id = InbredSet.SpeciesId and @@ -308,6 +314,7 @@ def _test_it(): datasets = build_datasets("Mouse", "BXD", "Hippocampus") #print("build_datasets:", pf(datasets)) + if __name__ == '__main__': Conn = MySQLdb.Connect(**parse_db_uri()) Cursor = Conn.cursor() diff --git a/wqflask/maintenance/generate_kinship_from_bimbam.py b/wqflask/maintenance/generate_kinship_from_bimbam.py index 60257b28..9f01d094 100644 --- a/wqflask/maintenance/generate_kinship_from_bimbam.py +++ b/wqflask/maintenance/generate_kinship_from_bimbam.py @@ -13,14 +13,17 @@ sys.path.append("..") import os import glob -class GenerateKinshipMatrices(object): + +class GenerateKinshipMatrices: def __init__(self, group_name, geno_file, pheno_file): self.group_name = group_name self.geno_file = geno_file self.pheno_file = pheno_file - + def generate_kinship(self): - gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + " -p " + self.pheno_file + " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name + gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + \ + " -p " + self.pheno_file + \ + " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name print("command:", gemma_command) os.system(gemma_command) @@ -33,9 +36,12 @@ class GenerateKinshipMatrices(object): group_name = ".".join(input_file.split('.')[:-1]) if group_name == "HSNIH-Palmer": continue - geno_input_file = os.path.join(bimbam_dir, group_name + "_geno.txt") - pheno_input_file = os.path.join(bimbam_dir, group_name + "_pheno.txt") - convertob = GenerateKinshipMatrices(group_name, geno_input_file, pheno_input_file) + geno_input_file = os.path.join( + bimbam_dir, group_name + "_geno.txt") + pheno_input_file = os.path.join( + bimbam_dir, group_name + "_pheno.txt") + convertob = GenerateKinshipMatrices( + group_name, geno_input_file, pheno_input_file) try: convertob.generate_kinship() except EmptyConfigurations as why: @@ -46,15 +52,15 @@ class GenerateKinshipMatrices(object): print(" Exception:", why) print(traceback.print_exc()) print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, - convertob.latest_col_pos)) + convertob.latest_col_pos)) print(" Column is:", convertob.latest_col_value) print(" Row is:", convertob.latest_row_value) break - - -if __name__=="__main__": + + +if __name__ == "__main__": Geno_Directory = """/export/local/home/zas1024/genotype_files/genotype/""" Bimbam_Directory = """/export/local/home/zas1024/genotype_files/genotype/bimbam/""" GenerateKinshipMatrices.process_all(Geno_Directory, Bimbam_Directory) - - #./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD + + # ./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py index b1e41e9a..e964c8ed 100644 --- a/wqflask/maintenance/generate_probesetfreeze_file.py +++ b/wqflask/maintenance/generate_probesetfreeze_file.py @@ -23,10 +23,12 @@ def get_cursor(): cursor = con.cursor() return cursor + def show_progress(process, counter): if counter % 1000 == 0: print("{}: {}".format(process, counter)) + def get_strains(cursor): cursor.execute("""select Strain.Name from Strain, StrainXRef, InbredSet @@ -42,6 +44,7 @@ def get_strains(cursor): return strains + def get_probeset_vals(cursor, dataset_name): cursor.execute(""" select ProbeSet.Id, ProbeSet.Name from ProbeSetXRef, @@ -77,6 +80,7 @@ def get_probeset_vals(cursor, dataset_name): return probeset_vals + def trim_strains(strains, probeset_vals): trimmed_strains = [] #print("probeset_vals is:", pf(probeset_vals)) @@ -89,6 +93,7 @@ def trim_strains(strains, probeset_vals): print("trimmed_strains:", pf(trimmed_strains)) return trimmed_strains + def write_data_matrix_file(strains, probeset_vals, filename): with open(filename, "wb") as fh: csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL) @@ -103,10 +108,12 @@ def write_data_matrix_file(strains, probeset_vals, filename): csv_writer.writerow(row_data) show_progress("Writing", counter) + def main(): - filename = os.path.expanduser("~/gene/wqflask/maintenance/" + - "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2" + - "(Oct08)_RankInv_Beta.txt") + filename = os.path.expanduser( + "~/gene/wqflask/maintenance/" + "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2" + "(Oct08)_RankInv_Beta.txt") dataset_name = "Eye_AXBXA_1008_RankInv" cursor = get_cursor() @@ -117,5 +124,6 @@ def main(): trimmed_strains = trim_strains(strains, probeset_vals) write_data_matrix_file(trimmed_strains, probeset_vals, filename) + if __name__ == '__main__': main() diff --git a/wqflask/maintenance/geno_to_json.py b/wqflask/maintenance/geno_to_json.py index 7e7fd241..32e0e34b 100644 --- a/wqflask/maintenance/geno_to_json.py +++ b/wqflask/maintenance/geno_to_json.py @@ -25,11 +25,12 @@ from pprint import pformat as pf #from utility.tools import flat_files -class EmptyConfigurations(Exception): pass - +class EmptyConfigurations(Exception): + pass -class Marker(object): + +class Marker: def __init__(self): self.name = None self.chr = None @@ -37,23 +38,24 @@ class Marker(object): self.Mb = None self.genotypes = [] -class ConvertGenoFile(object): + +class ConvertGenoFile: def __init__(self, input_file, output_file): - + self.input_file = input_file self.output_file = output_file - + self.mb_exists = False self.cm_exists = False self.markers = [] - + self.latest_row_pos = None self.latest_col_pos = None - + self.latest_row_value = None self.latest_col_value = None - + def convert(self): self.haplotype_notation = { @@ -61,24 +63,23 @@ class ConvertGenoFile(object): '@pat': "0", '@het': "0.5", '@unk': "NA" - } - + } + self.configurations = {} #self.skipped_cols = 3 - - #if self.input_file.endswith(".geno.gz"): + + # if self.input_file.endswith(".geno.gz"): # print("self.input_file: ", self.input_file) # self.input_fh = gzip.open(self.input_file) - #else: + # else: self.input_fh = open(self.input_file) - + with open(self.output_file, "w") as self.output_fh: - #if self.file_type == "geno": + # if self.file_type == "geno": self.process_csv() - #elif self.file_type == "snps": + # elif self.file_type == "snps": # self.process_snps_file() - def process_csv(self): for row_count, row in enumerate(self.process_rows()): row_items = row.split("\t") @@ -100,31 +101,31 @@ class ConvertGenoFile(object): genotypes = row_items[2:] for item_count, genotype in enumerate(genotypes): if genotype.upper() in self.configurations: - this_marker.genotypes.append(self.configurations[genotype.upper()]) + this_marker.genotypes.append( + self.configurations[genotype.upper()]) else: this_marker.genotypes.append("NA") - - #print("this_marker is:", pf(this_marker.__dict__)) - #if this_marker.chr == "14": + + #print("this_marker is:", pf(this_marker.__dict__)) + # if this_marker.chr == "14": self.markers.append(this_marker.__dict__) with open(self.output_file, 'w') as fh: json.dump(self.markers, fh, indent=" ", sort_keys=True) - - # print('configurations:', str(configurations)) - #self.latest_col_pos = item_count + self.skipped_cols - #self.latest_col_value = item - - #if item_count != 0: - # self.output_fh.write(" ") - #self.output_fh.write(self.configurations[item.upper()]) - - #self.output_fh.write("\n") + # print('configurations:', str(configurations)) + #self.latest_col_pos = item_count + self.skipped_cols + #self.latest_col_value = item + + # if item_count != 0: + # self.output_fh.write(" ") + # self.output_fh.write(self.configurations[item.upper()]) + + # self.output_fh.write("\n") def process_rows(self): for self.latest_row_pos, row in enumerate(self.input_fh): - #if self.input_file.endswith(".geno.gz"): + # if self.input_file.endswith(".geno.gz"): # print("row: ", row) self.latest_row_value = row # Take care of headers @@ -171,26 +172,25 @@ class ConvertGenoFile(object): print(" Exception:", why) print(traceback.print_exc()) print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, - convertob.latest_col_pos)) + convertob.latest_col_pos)) print(" Column is:", convertob.latest_col_value) print(" Row is:", convertob.latest_row_value) break - - #def process_snps_file(cls, snps_file, new_directory): + + # def process_snps_file(cls, snps_file, new_directory): # output_file = os.path.join(new_directory, "mouse_families.json") # print("%s -> %s" % (snps_file, output_file)) # convertob = ConvertGenoFile(input_file, output_file) - -if __name__=="__main__": +if __name__ == "__main__": Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype""" New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/json""" #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno""" #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps""" #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json") - #convertob.convert() + # convertob.convert() ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory) - #ConvertGenoFiles(Geno_Directory) - + # ConvertGenoFiles(Geno_Directory) + #process_csv(Input_File, Output_File) diff --git a/wqflask/maintenance/get_group_samplelists.py b/wqflask/maintenance/get_group_samplelists.py index 3f9d0278..0a450d3f 100644 --- a/wqflask/maintenance/get_group_samplelists.py +++ b/wqflask/maintenance/get_group_samplelists.py @@ -4,12 +4,14 @@ import gzip from base import webqtlConfig + def get_samplelist(file_type, geno_file): if file_type == "geno": return get_samplelist_from_geno(geno_file) elif file_type == "plink": return get_samplelist_from_plink(geno_file) + def get_samplelist_from_geno(genofilename): if os.path.isfile(genofilename + '.gz'): genofilename += '.gz' @@ -33,6 +35,7 @@ def get_samplelist_from_geno(genofilename): samplelist = headers[3:] return samplelist + def get_samplelist_from_plink(genofilename): genofile = open(genofilename) diff --git a/wqflask/maintenance/print_benchmark.py b/wqflask/maintenance/print_benchmark.py index b24ce4f1..9d12da8a 100644 --- a/wqflask/maintenance/print_benchmark.py +++ b/wqflask/maintenance/print_benchmark.py @@ -5,7 +5,7 @@ import time from pprint import pformat as pf -class TheCounter(object): +class TheCounter: Counters = {} def __init__(self): @@ -15,15 +15,18 @@ class TheCounter(object): self.time_took = time.time() - start_time TheCounter.Counters[self.__class__.__name__] = self.time_took + class PrintAll(TheCounter): def print_it(self, counter): print(counter) + class PrintSome(TheCounter): def print_it(self, counter): if counter % 1000 == 0: print(counter) + class PrintNone(TheCounter): def print_it(self, counter): pass @@ -37,5 +40,6 @@ def new_main(): print(pf(TheCounter.Counters)) + if __name__ == '__main__': new_main() diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py index 701b2b50..0cc963e5 100644 --- a/wqflask/maintenance/quantile_normalize.py +++ b/wqflask/maintenance/quantile_normalize.py @@ -14,42 +14,48 @@ from wqflask import app from utility.elasticsearch_tools import get_elasticsearch_connection from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI + def parse_db_uri(): """Converts a database URI to the db name, host name, user name, and password""" parsed_uri = urllib.parse.urlparse(SQL_URI) db_conn_info = dict( - db = parsed_uri.path[1:], - host = parsed_uri.hostname, - user = parsed_uri.username, - passwd = parsed_uri.password) + db=parsed_uri.path[1:], + host=parsed_uri.hostname, + user=parsed_uri.username, + passwd=parsed_uri.password) print(db_conn_info) return db_conn_info + def create_dataframe(input_file): with open(input_file) as f: ncols = len(f.readline().split("\t")) - input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t", skiprows=1, usecols=list(range(1, ncols))) + input_array = np.loadtxt(open( + input_file, "rb"), delimiter="\t", skiprows=1, usecols=list(range(1, ncols))) return pd.DataFrame(input_array) -#This function taken from https://github.com/ShawnLYU/Quantile_Normalize +# This function taken from https://github.com/ShawnLYU/Quantile_Normalize + + def quantileNormalize(df_input): df = df_input.copy() - #compute rank + # compute rank dic = {} for col in df: - dic.update({col : sorted(df[col])}) + dic.update({col: sorted(df[col])}) sorted_df = pd.DataFrame(dic) - rank = sorted_df.mean(axis = 1).tolist() - #sort + rank = sorted_df.mean(axis=1).tolist() + # sort for col in df: t = np.searchsorted(np.sort(df[col]), df[col]) df[col] = [rank[i] for i in t] return df + def set_data(dataset_name): orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt" @@ -64,10 +70,10 @@ def set_data(dataset_name): trait_name = line1.split('\t')[0] for i, sample in enumerate(sample_names): this_sample = { - "name": sample, - "value": line1.split('\t')[i+1], - "qnorm": line2.split('\t')[i+1] - } + "name": sample, + "value": line1.split('\t')[i + 1], + "qnorm": line2.split('\t')[i + 1] + } sample_list.append(this_sample) query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet @@ -95,13 +101,14 @@ def set_data(dataset_name): } } + if __name__ == '__main__': Conn = MySQLdb.Connect(**parse_db_uri()) Cursor = Conn.cursor() - #es = Elasticsearch([{ + # es = Elasticsearch([{ # "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT - #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None + # }], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None es = get_elasticsearch_connection(for_user=False) @@ -116,8 +123,8 @@ if __name__ == '__main__': success, _ = bulk(es, set_data(sys.argv[1])) response = es.search( - index = "traits", doc_type = "trait", body = { - "query": { "match": { "name": "ENSMUSG00000028982" } } + index="traits", doc_type="trait", body={ + "query": {"match": {"name": "ENSMUSG00000028982"}} } ) diff --git a/wqflask/maintenance/set_resource_defaults.py b/wqflask/maintenance/set_resource_defaults.py index 4177c124..0f472494 100644 --- a/wqflask/maintenance/set_resource_defaults.py +++ b/wqflask/maintenance/set_resource_defaults.py @@ -37,20 +37,22 @@ import urllib.parse from utility.logger import getLogger logger = getLogger(__name__) + def parse_db_uri(): """Converts a database URI to the db name, host name, user name, and password""" parsed_uri = urllib.parse.urlparse(SQL_URI) db_conn_info = dict( - db = parsed_uri.path[1:], - host = parsed_uri.hostname, - user = parsed_uri.username, - passwd = parsed_uri.password) + db=parsed_uri.path[1:], + host=parsed_uri.hostname, + user=parsed_uri.username, + passwd=parsed_uri.password) print(db_conn_info) return db_conn_info + def insert_probeset_resources(default_owner_id): current_resources = Redis.hgetall("resources") Cursor.execute(""" SELECT @@ -63,20 +65,21 @@ def insert_probeset_resources(default_owner_id): resource_ob = {} resource_ob['name'] = resource[1] resource_ob['owner_id'] = default_owner_id - resource_ob['data'] = { "dataset" : str(resource[0])} + resource_ob['data'] = {"dataset": str(resource[0])} resource_ob['type'] = "dataset-probeset" if resource[2] < 1 and resource[3] > 0: - resource_ob['default_mask'] = { "data": "view", - "metadata": "view", - "admin": "not-admin"} + resource_ob['default_mask'] = {"data": "view", + "metadata": "view", + "admin": "not-admin"} else: - resource_ob['default_mask'] = { "data": "no-access", - "metadata": "no-access", - "admin": "not-admin"} + resource_ob['default_mask'] = {"data": "no-access", + "metadata": "no-access", + "admin": "not-admin"} resource_ob['group_masks'] = {} add_resource(resource_ob, update=False) + def insert_publish_resources(default_owner_id): current_resources = Redis.hgetall("resources") Cursor.execute(""" SELECT @@ -97,12 +100,12 @@ def insert_publish_resources(default_owner_id): else: resource_ob['name'] = str(resource[0]) resource_ob['owner_id'] = default_owner_id - resource_ob['data'] = { "dataset" : str(resource[1]) , - "trait" : str(resource[0])} + resource_ob['data'] = {"dataset": str(resource[1]), + "trait": str(resource[0])} resource_ob['type'] = "dataset-publish" - resource_ob['default_mask'] = { "data": "view", - "metadata": "view", - "admin": "not-admin"} + resource_ob['default_mask'] = {"data": "view", + "metadata": "view", + "admin": "not-admin"} resource_ob['group_masks'] = {} @@ -110,6 +113,7 @@ def insert_publish_resources(default_owner_id): else: continue + def insert_geno_resources(default_owner_id): current_resources = Redis.hgetall("resources") Cursor.execute(""" SELECT @@ -125,20 +129,21 @@ def insert_geno_resources(default_owner_id): resource_ob['owner_id'] = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae" else: resource_ob['owner_id'] = default_owner_id - resource_ob['data'] = { "dataset" : str(resource[0]) } + resource_ob['data'] = {"dataset": str(resource[0])} resource_ob['type'] = "dataset-geno" if resource[2] < 1: - resource_ob['default_mask'] = { "data": "view", - "metadata": "view", - "admin": "not-admin"} + resource_ob['default_mask'] = {"data": "view", + "metadata": "view", + "admin": "not-admin"} else: - resource_ob['default_mask'] = { "data": "no-access", - "metadata": "no-access", - "admin": "not-admin"} + resource_ob['default_mask'] = {"data": "no-access", + "metadata": "no-access", + "admin": "not-admin"} resource_ob['group_masks'] = {} add_resource(resource_ob, update=False) + def insert_resources(default_owner_id): current_resources = get_resources() print("START") @@ -149,6 +154,7 @@ def insert_resources(default_owner_id): insert_probeset_resources(default_owner_id) print("AFTER PROBESET") + def main(): """Generates and outputs (as json file) the data for the main dropdown menus on the home page""" @@ -158,6 +164,7 @@ def main(): insert_resources(owner_id) + if __name__ == '__main__': Conn = MySQLdb.Connect(**parse_db_uri()) Cursor = Conn.cursor() |