diff options
author | Arthur Centeno | 2021-10-25 21:04:23 +0000 |
---|---|---|
committer | Arthur Centeno | 2021-10-25 21:04:23 +0000 |
commit | 499a80f138030c4de1629c043c8f9401a99894ea (patch) | |
tree | 449dcae965d13f966fb6d52625fbc86661c8c6a0 /wqflask/maintenance | |
parent | 6151faa9ea67af4bf4ea95fb681a9dc4319474b6 (diff) | |
parent | 700802303e5e8221a9d591ba985d6607aa61e1ce (diff) | |
download | genenetwork2-499a80f138030c4de1629c043c8f9401a99894ea.tar.gz |
Merge github.com:genenetwork/genenetwork2 into acenteno
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r-- | wqflask/maintenance/convert_dryad_to_bimbam.py | 14 | ||||
-rw-r--r-- | wqflask/maintenance/convert_geno_to_bimbam.py | 47 | ||||
-rw-r--r-- | wqflask/maintenance/gen_select_dataset.py | 49 | ||||
-rw-r--r-- | wqflask/maintenance/generate_kinship_from_bimbam.py | 31 | ||||
-rw-r--r-- | wqflask/maintenance/generate_probesetfreeze_file.py | 18 | ||||
-rw-r--r-- | wqflask/maintenance/geno_to_json.py | 85 | ||||
-rw-r--r-- | wqflask/maintenance/get_group_samplelists.py | 5 | ||||
-rw-r--r-- | wqflask/maintenance/print_benchmark.py | 10 | ||||
-rw-r--r-- | wqflask/maintenance/quantile_normalize.py | 58 | ||||
-rw-r--r-- | wqflask/maintenance/set_resource_defaults.py | 171 |
10 files changed, 343 insertions, 145 deletions
diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py index e833b395..18fbb8a1 100644 --- a/wqflask/maintenance/convert_dryad_to_bimbam.py +++ b/wqflask/maintenance/convert_dryad_to_bimbam.py @@ -6,7 +6,6 @@ Convert data dryad files to a BIMBAM _geno and _snps file """ -from __future__ import print_function, division, absolute_import import sys sys.path.append("..") @@ -42,7 +41,7 @@ def read_dryad_file(filename): return geno_rows - #for i, marker in enumerate(marker_list): + # for i, marker in enumerate(marker_list): # this_row = [] # this_row.append(marker) # this_row.append("X") @@ -53,18 +52,21 @@ def read_dryad_file(filename): # this_row.append(line.split(" ")[i+2]) # print("row: " + str(i)) # geno_rows.append(this_row) - # - #return geno_rows + # + # return geno_rows + def write_bimbam_files(geno_rows): with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh: for row in geno_rows: geno_fh.write(", ".join(row) + "\n") + def convert_dryad_to_bimbam(filename): geno_file_rows = read_dryad_file(filename) write_bimbam_files(geno_file_rows) -if __name__=="__main__": + +if __name__ == "__main__": input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt" - convert_dryad_to_bimbam(input_filename)
\ No newline at end of file + convert_dryad_to_bimbam(input_filename) diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py index 528b98cf..078be529 100644 --- a/wqflask/maintenance/convert_geno_to_bimbam.py +++ b/wqflask/maintenance/convert_geno_to_bimbam.py @@ -9,7 +9,6 @@ code """ -from __future__ import print_function, division, absolute_import import sys sys.path.append("..") import os @@ -21,9 +20,12 @@ import simplejson as json from pprint import pformat as pf -class EmptyConfigurations(Exception): pass -class Marker(object): +class EmptyConfigurations(Exception): + pass + + +class Marker: def __init__(self): self.name = None self.chr = None @@ -31,7 +33,8 @@ class Marker(object): self.Mb = None self.genotypes = [] -class ConvertGenoFile(object): + +class ConvertGenoFile: def __init__(self, input_file, output_files): self.input_file = input_file @@ -53,7 +56,7 @@ class ConvertGenoFile(object): '@pat': "0", '@het': "0.5", '@unk': "NA" - } + } self.configurations = {} self.input_fh = open(self.input_file) @@ -81,13 +84,14 @@ class ConvertGenoFile(object): genotypes = row_items[2:] for item_count, genotype in enumerate(genotypes): if genotype.upper().strip() in self.configurations: - this_marker.genotypes.append(self.configurations[genotype.upper().strip()]) + this_marker.genotypes.append( + self.configurations[genotype.upper().strip()]) else: this_marker.genotypes.append("NA") self.markers.append(this_marker.__dict__) - self.write_to_bimbam() + self.write_to_bimbam() def write_to_bimbam(self): with open(self.output_files[0], "w") as geno_fh: @@ -104,9 +108,11 @@ class ConvertGenoFile(object): with open(self.output_files[2], "w") as snp_fh: for marker in self.markers: if self.mb_exists: - snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n") + snp_fh.write( + marker['name'] + ", " + str(int(float(marker['Mb']) * 1000000)) + ", " + marker['chr'] + "\n") else: - snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n") + snp_fh.write( + marker['name'] + ", " + str(int(float(marker['cM']) * 1000000)) + ", " + marker['chr'] + "\n") def get_sample_list(self, row_contents): self.sample_list = [] @@ -120,7 +126,7 @@ class ConvertGenoFile(object): self.sample_list = row_contents[3:] else: self.sample_list = row_contents[2:] - + def process_rows(self): for self.latest_row_pos, row in enumerate(self.input_fh): self.latest_row_value = row @@ -158,10 +164,14 @@ class ConvertGenoFile(object): group_name = ".".join(input_file.split('.')[:-1]) if group_name == "HSNIH-Palmer": continue - geno_output_file = os.path.join(new_directory, group_name + "_geno.txt") - pheno_output_file = os.path.join(new_directory, group_name + "_pheno.txt") - snp_output_file = os.path.join(new_directory, group_name + "_snps.txt") - output_files = [geno_output_file, pheno_output_file, snp_output_file] + geno_output_file = os.path.join( + new_directory, group_name + "_geno.txt") + pheno_output_file = os.path.join( + new_directory, group_name + "_pheno.txt") + snp_output_file = os.path.join( + new_directory, group_name + "_snps.txt") + output_files = [geno_output_file, + pheno_output_file, snp_output_file] print("%s -> %s" % ( os.path.join(old_directory, input_file), geno_output_file)) convertob = ConvertGenoFile(input_file, output_files) @@ -174,17 +184,18 @@ class ConvertGenoFile(object): print(" Exception:", why) print(traceback.print_exc()) print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, - convertob.latest_col_pos)) + convertob.latest_col_pos)) print(" Column is:", convertob.latest_col_value) print(" Row is:", convertob.latest_row_value) break -if __name__=="__main__": + +if __name__ == "__main__": Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype""" New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/bimbam""" #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno""" #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps""" #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json") - #convertob.convert() + # convertob.convert() ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory) - #ConvertGenoFiles(Geno_Directory)
\ No newline at end of file + # ConvertGenoFiles(Geno_Directory) diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py index 647e58a2..db65a11f 100644 --- a/wqflask/maintenance/gen_select_dataset.py +++ b/wqflask/maintenance/gen_select_dataset.py @@ -30,18 +30,10 @@ It needs to be run manually when database has been changed. Run it as # # This module is used by GeneNetwork project (www.genenetwork.org) -from __future__ import print_function, division - -#from flask import config -# -#cdict = {} -#config = config.Config(cdict).from_envvar('WQFLASK_SETTINGS') -#print("cdict is:", cdict) - import sys # NEW: Note we prepend the current path - otherwise a guix instance of GN2 may be used instead -sys.path.insert(0,'./') +sys.path.insert(0, './') # NEW: import app to avoid a circular dependency on utility.tools from wqflask import app @@ -50,7 +42,7 @@ from utility.tools import locate, locate_ignore_error, TEMPDIR, SQL_URI import MySQLdb import simplejson as json -import urlparse +import urllib.parse #import sqlalchemy as sa @@ -63,16 +55,17 @@ from pprint import pformat as pf #conn = Engine.connect() + def parse_db_uri(): """Converts a database URI to the db name, host name, user name, and password""" - parsed_uri = urlparse.urlparse(SQL_URI) + parsed_uri = urllib.parse.urlparse(SQL_URI) db_conn_info = dict( - db = parsed_uri.path[1:], - host = parsed_uri.hostname, - user = parsed_uri.username, - passwd = parsed_uri.password) + db=parsed_uri.path[1:], + host=parsed_uri.hostname, + user=parsed_uri.username, + passwd=parsed_uri.password) print(db_conn_info) return db_conn_info @@ -108,7 +101,7 @@ def get_types(groups): """Build types list""" types = {} #print("Groups: ", pf(groups)) - for species, group_dict in groups.iteritems(): + for species, group_dict in list(groups.items()): types[species] = {} for group_name, _group_full_name in group_dict: # make group an alias to shorten the code @@ -127,21 +120,23 @@ def get_types(groups): else: if not phenotypes_exist(group_name) and not genotypes_exist(group_name): types[species].pop(group_name, None) - groups[species] = tuple(group for group in groups[species] if group[0] != group_name) - else: #ZS: This whole else statement might be unnecessary, need to check + groups[species] = tuple( + group for group in groups[species] if group[0] != group_name) + else: # ZS: This whole else statement might be unnecessary, need to check types_list = build_types(species, group_name) if len(types_list) > 0: types[species][group_name] = types_list else: types[species].pop(group_name, None) - groups[species] = tuple(group for group in groups[species] if group[0] != group_name) + groups[species] = tuple( + group for group in groups[species] if group[0] != group_name) return types def phenotypes_exist(group_name): #print("group_name:", group_name) Cursor.execute("""select Name from PublishFreeze - where PublishFreeze.Name = '%s'""" % (group_name+"Publish")) + where PublishFreeze.Name = '%s'""" % (group_name + "Publish")) results = Cursor.fetchone() #print("RESULTS:", results) @@ -151,10 +146,11 @@ def phenotypes_exist(group_name): else: return False + def genotypes_exist(group_name): #print("group_name:", group_name) Cursor.execute("""select Name from GenoFreeze - where GenoFreeze.Name = '%s'""" % (group_name+"Geno")) + where GenoFreeze.Name = '%s'""" % (group_name + "Geno")) results = Cursor.fetchone() #print("RESULTS:", results) @@ -164,6 +160,7 @@ def genotypes_exist(group_name): else: return False + def build_types(species, group): """Fetches tissues @@ -192,12 +189,13 @@ def build_types(species, group): return results + def get_datasets(types): """Build datasets list""" datasets = {} - for species, group_dict in types.iteritems(): + for species, group_dict in list(types.items()): datasets[species] = {} - for group, type_list in group_dict.iteritems(): + for group, type_list in list(group_dict.items()): datasets[species][group] = {} for type_name in type_list: these_datasets = build_datasets(species, group, type_name[0]) @@ -254,7 +252,7 @@ def build_datasets(species, group, type_name): dataset_text = "%s Genotypes" % group datasets.append((dataset_id, dataset_value, dataset_text)) - else: # for mRNA expression/ProbeSet + else: # for mRNA expression/ProbeSet Cursor.execute("""select ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName from ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue, Species where Species.Name = '%s' and Species.Id = InbredSet.SpeciesId and @@ -316,7 +314,8 @@ def _test_it(): datasets = build_datasets("Mouse", "BXD", "Hippocampus") #print("build_datasets:", pf(datasets)) + if __name__ == '__main__': Conn = MySQLdb.Connect(**parse_db_uri()) Cursor = Conn.cursor() - main()
\ No newline at end of file + main() diff --git a/wqflask/maintenance/generate_kinship_from_bimbam.py b/wqflask/maintenance/generate_kinship_from_bimbam.py index b53f5dda..9f01d094 100644 --- a/wqflask/maintenance/generate_kinship_from_bimbam.py +++ b/wqflask/maintenance/generate_kinship_from_bimbam.py @@ -8,20 +8,22 @@ and uses GEMMA to generate their corresponding kinship/relatedness matrix file """ -from __future__ import print_function, division, absolute_import import sys sys.path.append("..") import os import glob -class GenerateKinshipMatrices(object): + +class GenerateKinshipMatrices: def __init__(self, group_name, geno_file, pheno_file): self.group_name = group_name self.geno_file = geno_file self.pheno_file = pheno_file - + def generate_kinship(self): - gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + " -p " + self.pheno_file + " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name + gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + \ + " -p " + self.pheno_file + \ + " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name print("command:", gemma_command) os.system(gemma_command) @@ -34,9 +36,12 @@ class GenerateKinshipMatrices(object): group_name = ".".join(input_file.split('.')[:-1]) if group_name == "HSNIH-Palmer": continue - geno_input_file = os.path.join(bimbam_dir, group_name + "_geno.txt") - pheno_input_file = os.path.join(bimbam_dir, group_name + "_pheno.txt") - convertob = GenerateKinshipMatrices(group_name, geno_input_file, pheno_input_file) + geno_input_file = os.path.join( + bimbam_dir, group_name + "_geno.txt") + pheno_input_file = os.path.join( + bimbam_dir, group_name + "_pheno.txt") + convertob = GenerateKinshipMatrices( + group_name, geno_input_file, pheno_input_file) try: convertob.generate_kinship() except EmptyConfigurations as why: @@ -47,15 +52,15 @@ class GenerateKinshipMatrices(object): print(" Exception:", why) print(traceback.print_exc()) print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, - convertob.latest_col_pos)) + convertob.latest_col_pos)) print(" Column is:", convertob.latest_col_value) print(" Row is:", convertob.latest_row_value) break - - -if __name__=="__main__": + + +if __name__ == "__main__": Geno_Directory = """/export/local/home/zas1024/genotype_files/genotype/""" Bimbam_Directory = """/export/local/home/zas1024/genotype_files/genotype/bimbam/""" GenerateKinshipMatrices.process_all(Geno_Directory, Bimbam_Directory) - - #./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD
\ No newline at end of file + + # ./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py index b7b2dc8e..e964c8ed 100644 --- a/wqflask/maintenance/generate_probesetfreeze_file.py +++ b/wqflask/maintenance/generate_probesetfreeze_file.py @@ -1,7 +1,5 @@ #!/usr/bin/python -from __future__ import absolute_import, print_function, division - import sys # sys.path.insert(0, "..") - why? @@ -25,10 +23,12 @@ def get_cursor(): cursor = con.cursor() return cursor + def show_progress(process, counter): if counter % 1000 == 0: print("{}: {}".format(process, counter)) + def get_strains(cursor): cursor.execute("""select Strain.Name from Strain, StrainXRef, InbredSet @@ -44,6 +44,7 @@ def get_strains(cursor): return strains + def get_probeset_vals(cursor, dataset_name): cursor.execute(""" select ProbeSet.Id, ProbeSet.Name from ProbeSetXRef, @@ -79,10 +80,11 @@ def get_probeset_vals(cursor, dataset_name): return probeset_vals + def trim_strains(strains, probeset_vals): trimmed_strains = [] #print("probeset_vals is:", pf(probeset_vals)) - first_probeset = list(probeset_vals.itervalues())[0] + first_probeset = list(probeset_vals.values())[0] print("\n**** first_probeset is:", pf(first_probeset)) for strain in strains: print("\n**** strain is:", pf(strain)) @@ -91,6 +93,7 @@ def trim_strains(strains, probeset_vals): print("trimmed_strains:", pf(trimmed_strains)) return trimmed_strains + def write_data_matrix_file(strains, probeset_vals, filename): with open(filename, "wb") as fh: csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL) @@ -105,10 +108,12 @@ def write_data_matrix_file(strains, probeset_vals, filename): csv_writer.writerow(row_data) show_progress("Writing", counter) + def main(): - filename = os.path.expanduser("~/gene/wqflask/maintenance/" + - "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2" + - "(Oct08)_RankInv_Beta.txt") + filename = os.path.expanduser( + "~/gene/wqflask/maintenance/" + "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2" + "(Oct08)_RankInv_Beta.txt") dataset_name = "Eye_AXBXA_1008_RankInv" cursor = get_cursor() @@ -119,5 +124,6 @@ def main(): trimmed_strains = trim_strains(strains, probeset_vals) write_data_matrix_file(trimmed_strains, probeset_vals, filename) + if __name__ == '__main__': main() diff --git a/wqflask/maintenance/geno_to_json.py b/wqflask/maintenance/geno_to_json.py index 9579812a..32e0e34b 100644 --- a/wqflask/maintenance/geno_to_json.py +++ b/wqflask/maintenance/geno_to_json.py @@ -9,7 +9,6 @@ code """ -from __future__ import print_function, division, absolute_import import sys sys.path.append("..") import os @@ -26,11 +25,12 @@ from pprint import pformat as pf #from utility.tools import flat_files -class EmptyConfigurations(Exception): pass - +class EmptyConfigurations(Exception): + pass -class Marker(object): + +class Marker: def __init__(self): self.name = None self.chr = None @@ -38,23 +38,24 @@ class Marker(object): self.Mb = None self.genotypes = [] -class ConvertGenoFile(object): + +class ConvertGenoFile: def __init__(self, input_file, output_file): - + self.input_file = input_file self.output_file = output_file - + self.mb_exists = False self.cm_exists = False self.markers = [] - + self.latest_row_pos = None self.latest_col_pos = None - + self.latest_row_value = None self.latest_col_value = None - + def convert(self): self.haplotype_notation = { @@ -62,24 +63,23 @@ class ConvertGenoFile(object): '@pat': "0", '@het': "0.5", '@unk': "NA" - } - + } + self.configurations = {} #self.skipped_cols = 3 - - #if self.input_file.endswith(".geno.gz"): + + # if self.input_file.endswith(".geno.gz"): # print("self.input_file: ", self.input_file) # self.input_fh = gzip.open(self.input_file) - #else: + # else: self.input_fh = open(self.input_file) - + with open(self.output_file, "w") as self.output_fh: - #if self.file_type == "geno": + # if self.file_type == "geno": self.process_csv() - #elif self.file_type == "snps": + # elif self.file_type == "snps": # self.process_snps_file() - def process_csv(self): for row_count, row in enumerate(self.process_rows()): row_items = row.split("\t") @@ -101,31 +101,31 @@ class ConvertGenoFile(object): genotypes = row_items[2:] for item_count, genotype in enumerate(genotypes): if genotype.upper() in self.configurations: - this_marker.genotypes.append(self.configurations[genotype.upper()]) + this_marker.genotypes.append( + self.configurations[genotype.upper()]) else: this_marker.genotypes.append("NA") - - #print("this_marker is:", pf(this_marker.__dict__)) - #if this_marker.chr == "14": + + #print("this_marker is:", pf(this_marker.__dict__)) + # if this_marker.chr == "14": self.markers.append(this_marker.__dict__) with open(self.output_file, 'w') as fh: json.dump(self.markers, fh, indent=" ", sort_keys=True) - - # print('configurations:', str(configurations)) - #self.latest_col_pos = item_count + self.skipped_cols - #self.latest_col_value = item - - #if item_count != 0: - # self.output_fh.write(" ") - #self.output_fh.write(self.configurations[item.upper()]) - - #self.output_fh.write("\n") + # print('configurations:', str(configurations)) + #self.latest_col_pos = item_count + self.skipped_cols + #self.latest_col_value = item + + # if item_count != 0: + # self.output_fh.write(" ") + # self.output_fh.write(self.configurations[item.upper()]) + + # self.output_fh.write("\n") def process_rows(self): for self.latest_row_pos, row in enumerate(self.input_fh): - #if self.input_file.endswith(".geno.gz"): + # if self.input_file.endswith(".geno.gz"): # print("row: ", row) self.latest_row_value = row # Take care of headers @@ -172,26 +172,25 @@ class ConvertGenoFile(object): print(" Exception:", why) print(traceback.print_exc()) print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos, - convertob.latest_col_pos)) + convertob.latest_col_pos)) print(" Column is:", convertob.latest_col_value) print(" Row is:", convertob.latest_row_value) break - - #def process_snps_file(cls, snps_file, new_directory): + + # def process_snps_file(cls, snps_file, new_directory): # output_file = os.path.join(new_directory, "mouse_families.json") # print("%s -> %s" % (snps_file, output_file)) # convertob = ConvertGenoFile(input_file, output_file) - -if __name__=="__main__": +if __name__ == "__main__": Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype""" New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/json""" #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno""" #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps""" #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json") - #convertob.convert() + # convertob.convert() ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory) - #ConvertGenoFiles(Geno_Directory) - - #process_csv(Input_File, Output_File)
\ No newline at end of file + # ConvertGenoFiles(Geno_Directory) + + #process_csv(Input_File, Output_File) diff --git a/wqflask/maintenance/get_group_samplelists.py b/wqflask/maintenance/get_group_samplelists.py index fb22898a..0a450d3f 100644 --- a/wqflask/maintenance/get_group_samplelists.py +++ b/wqflask/maintenance/get_group_samplelists.py @@ -1,17 +1,17 @@ -from __future__ import absolute_import, print_function, division - import os import glob import gzip from base import webqtlConfig + def get_samplelist(file_type, geno_file): if file_type == "geno": return get_samplelist_from_geno(geno_file) elif file_type == "plink": return get_samplelist_from_plink(geno_file) + def get_samplelist_from_geno(genofilename): if os.path.isfile(genofilename + '.gz'): genofilename += '.gz' @@ -35,6 +35,7 @@ def get_samplelist_from_geno(genofilename): samplelist = headers[3:] return samplelist + def get_samplelist_from_plink(genofilename): genofile = open(genofilename) diff --git a/wqflask/maintenance/print_benchmark.py b/wqflask/maintenance/print_benchmark.py index ae327cf3..9d12da8a 100644 --- a/wqflask/maintenance/print_benchmark.py +++ b/wqflask/maintenance/print_benchmark.py @@ -1,13 +1,11 @@ #!/usr/bin/python -from __future__ import absolute_import, print_function, division - import time from pprint import pformat as pf -class TheCounter(object): +class TheCounter: Counters = {} def __init__(self): @@ -17,15 +15,18 @@ class TheCounter(object): self.time_took = time.time() - start_time TheCounter.Counters[self.__class__.__name__] = self.time_took + class PrintAll(TheCounter): def print_it(self, counter): print(counter) + class PrintSome(TheCounter): def print_it(self, counter): if counter % 1000 == 0: print(counter) + class PrintNone(TheCounter): def print_it(self, counter): pass @@ -39,5 +40,6 @@ def new_main(): print(pf(TheCounter.Counters)) + if __name__ == '__main__': - new_main()
\ No newline at end of file + new_main() diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py index 41a3aad8..0cc963e5 100644 --- a/wqflask/maintenance/quantile_normalize.py +++ b/wqflask/maintenance/quantile_normalize.py @@ -1,12 +1,7 @@ -from __future__ import absolute_import, print_function, division - import sys -sys.path.insert(0,'./') - -from itertools import izip - +sys.path.insert(0, './') import MySQLdb -import urlparse +import urllib.parse import numpy as np import pandas as pd @@ -19,48 +14,54 @@ from wqflask import app from utility.elasticsearch_tools import get_elasticsearch_connection from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI + def parse_db_uri(): """Converts a database URI to the db name, host name, user name, and password""" - parsed_uri = urlparse.urlparse(SQL_URI) + parsed_uri = urllib.parse.urlparse(SQL_URI) db_conn_info = dict( - db = parsed_uri.path[1:], - host = parsed_uri.hostname, - user = parsed_uri.username, - passwd = parsed_uri.password) + db=parsed_uri.path[1:], + host=parsed_uri.hostname, + user=parsed_uri.username, + passwd=parsed_uri.password) print(db_conn_info) return db_conn_info + def create_dataframe(input_file): with open(input_file) as f: ncols = len(f.readline().split("\t")) - input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t", skiprows=1, usecols=range(1, ncols)) + input_array = np.loadtxt(open( + input_file, "rb"), delimiter="\t", skiprows=1, usecols=list(range(1, ncols))) return pd.DataFrame(input_array) -#This function taken from https://github.com/ShawnLYU/Quantile_Normalize +# This function taken from https://github.com/ShawnLYU/Quantile_Normalize + + def quantileNormalize(df_input): df = df_input.copy() - #compute rank + # compute rank dic = {} for col in df: - dic.update({col : sorted(df[col])}) + dic.update({col: sorted(df[col])}) sorted_df = pd.DataFrame(dic) - rank = sorted_df.mean(axis = 1).tolist() - #sort + rank = sorted_df.mean(axis=1).tolist() + # sort for col in df: t = np.searchsorted(np.sort(df[col]), df[col]) df[col] = [rank[i] for i in t] return df + def set_data(dataset_name): orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt" sample_list = [] with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh: - for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)): + for i, (line1, line2) in enumerate(zip(orig_fh, quant_fh)): trait_dict = {} sample_list = [] if i == 0: @@ -69,10 +70,10 @@ def set_data(dataset_name): trait_name = line1.split('\t')[0] for i, sample in enumerate(sample_names): this_sample = { - "name": sample, - "value": line1.split('\t')[i+1], - "qnorm": line2.split('\t')[i+1] - } + "name": sample, + "value": line1.split('\t')[i + 1], + "qnorm": line2.split('\t')[i + 1] + } sample_list.append(this_sample) query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet @@ -100,13 +101,14 @@ def set_data(dataset_name): } } + if __name__ == '__main__': Conn = MySQLdb.Connect(**parse_db_uri()) Cursor = Conn.cursor() - #es = Elasticsearch([{ + # es = Elasticsearch([{ # "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT - #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None + # }], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None es = get_elasticsearch_connection(for_user=False) @@ -121,9 +123,9 @@ if __name__ == '__main__': success, _ = bulk(es, set_data(sys.argv[1])) response = es.search( - index = "traits", doc_type = "trait", body = { - "query": { "match": { "name": "ENSMUSG00000028982" } } + index="traits", doc_type="trait", body={ + "query": {"match": {"name": "ENSMUSG00000028982"}} } ) - print(response)
\ No newline at end of file + print(response) diff --git a/wqflask/maintenance/set_resource_defaults.py b/wqflask/maintenance/set_resource_defaults.py new file mode 100644 index 00000000..0f472494 --- /dev/null +++ b/wqflask/maintenance/set_resource_defaults.py @@ -0,0 +1,171 @@ +""" + +Script that sets default resource access masks for use with the DB proxy + +Defaults will be: +Owner - omni_gn +Mask - Public/non-confidential: { data: "view", + metadata: "view", + admin: "not-admin" } + Private/confidentia: { data: "no-access", + metadata: "no-access", + admin: "not-admin" } + +To run: +./bin/genenetwork2 ~/my_settings.py -c ./wqflask/maintenance/gen_select_dataset.py + +""" + +import sys +import json + +# NEW: Note we prepend the current path - otherwise a guix instance of GN2 may be used instead +sys.path.insert(0, './') + +# NEW: import app to avoid a circular dependency on utility.tools +from wqflask import app + +from utility import hmac +from utility.tools import SQL_URI +from utility.redis_tools import get_redis_conn, get_user_id, add_resource, get_resources, get_resource_info +Redis = get_redis_conn() + +import MySQLdb + +import urllib.parse + +from utility.logger import getLogger +logger = getLogger(__name__) + + +def parse_db_uri(): + """Converts a database URI to the db name, host name, user name, and password""" + + parsed_uri = urllib.parse.urlparse(SQL_URI) + + db_conn_info = dict( + db=parsed_uri.path[1:], + host=parsed_uri.hostname, + user=parsed_uri.username, + passwd=parsed_uri.password) + + print(db_conn_info) + return db_conn_info + + +def insert_probeset_resources(default_owner_id): + current_resources = Redis.hgetall("resources") + Cursor.execute(""" SELECT + ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.confidentiality, ProbeSetFreeze.public + FROM + ProbeSetFreeze""") + + resource_results = Cursor.fetchall() + for i, resource in enumerate(resource_results): + resource_ob = {} + resource_ob['name'] = resource[1] + resource_ob['owner_id'] = default_owner_id + resource_ob['data'] = {"dataset": str(resource[0])} + resource_ob['type'] = "dataset-probeset" + if resource[2] < 1 and resource[3] > 0: + resource_ob['default_mask'] = {"data": "view", + "metadata": "view", + "admin": "not-admin"} + else: + resource_ob['default_mask'] = {"data": "no-access", + "metadata": "no-access", + "admin": "not-admin"} + resource_ob['group_masks'] = {} + + add_resource(resource_ob, update=False) + + +def insert_publish_resources(default_owner_id): + current_resources = Redis.hgetall("resources") + Cursor.execute(""" SELECT + PublishXRef.Id, PublishFreeze.Id, InbredSet.InbredSetCode + FROM + PublishXRef, PublishFreeze, InbredSet, Publication + WHERE + PublishFreeze.InbredSetId = PublishXRef.InbredSetId AND + InbredSet.Id = PublishXRef.InbredSetId AND + Publication.Id = PublishXRef.PublicationId""") + + resource_results = Cursor.fetchall() + for resource in resource_results: + if resource[2]: + resource_ob = {} + if resource[2]: + resource_ob['name'] = resource[2] + "_" + str(resource[0]) + else: + resource_ob['name'] = str(resource[0]) + resource_ob['owner_id'] = default_owner_id + resource_ob['data'] = {"dataset": str(resource[1]), + "trait": str(resource[0])} + resource_ob['type'] = "dataset-publish" + resource_ob['default_mask'] = {"data": "view", + "metadata": "view", + "admin": "not-admin"} + + resource_ob['group_masks'] = {} + + add_resource(resource_ob, update=False) + else: + continue + + +def insert_geno_resources(default_owner_id): + current_resources = Redis.hgetall("resources") + Cursor.execute(""" SELECT + GenoFreeze.Id, GenoFreeze.ShortName, GenoFreeze.confidentiality + FROM + GenoFreeze""") + + resource_results = Cursor.fetchall() + for i, resource in enumerate(resource_results): + resource_ob = {} + resource_ob['name'] = resource[1] + if resource[1] == "HET3-ITPGeno": + resource_ob['owner_id'] = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae" + else: + resource_ob['owner_id'] = default_owner_id + resource_ob['data'] = {"dataset": str(resource[0])} + resource_ob['type'] = "dataset-geno" + if resource[2] < 1: + resource_ob['default_mask'] = {"data": "view", + "metadata": "view", + "admin": "not-admin"} + else: + resource_ob['default_mask'] = {"data": "no-access", + "metadata": "no-access", + "admin": "not-admin"} + resource_ob['group_masks'] = {} + + add_resource(resource_ob, update=False) + + +def insert_resources(default_owner_id): + current_resources = get_resources() + print("START") + insert_publish_resources(default_owner_id) + print("AFTER PUBLISH") + insert_geno_resources(default_owner_id) + print("AFTER GENO") + insert_probeset_resources(default_owner_id) + print("AFTER PROBESET") + + +def main(): + """Generates and outputs (as json file) the data for the main dropdown menus on the home page""" + + Redis.delete("resources") + + owner_id = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae" + + insert_resources(owner_id) + + +if __name__ == '__main__': + Conn = MySQLdb.Connect(**parse_db_uri()) + Cursor = Conn.cursor() + main() |