aboutsummaryrefslogtreecommitdiff
path: root/wqflask/maintenance
diff options
context:
space:
mode:
authorzsloan2021-10-18 17:50:26 +0000
committerzsloan2021-10-18 17:50:26 +0000
commite36eaf0003a598bc5aa688803dd1b36c24a4c051 (patch)
treea59b7dadf02241575eb0774f97c6048e2425c053 /wqflask/maintenance
parentbd421438f1f0b4de913fa40cd49cfcda27e6b16f (diff)
parent04f3d13aceeaec2e52b94037d59f08ed6dc6a8bb (diff)
downloadgenenetwork2-e36eaf0003a598bc5aa688803dd1b36c24a4c051.tar.gz
Merge branch 'testing' of github.com:genenetwork/genenetwork2 into feature/remove_trait_creation_from_search
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r--wqflask/maintenance/convert_dryad_to_bimbam.py11
-rw-r--r--wqflask/maintenance/convert_geno_to_bimbam.py46
-rw-r--r--wqflask/maintenance/gen_select_dataset.py27
-rw-r--r--wqflask/maintenance/generate_kinship_from_bimbam.py30
-rw-r--r--wqflask/maintenance/generate_probesetfreeze_file.py14
-rw-r--r--wqflask/maintenance/geno_to_json.py82
-rw-r--r--wqflask/maintenance/get_group_samplelists.py3
-rw-r--r--wqflask/maintenance/print_benchmark.py6
-rw-r--r--wqflask/maintenance/quantile_normalize.py43
-rw-r--r--wqflask/maintenance/set_resource_defaults.py53
10 files changed, 186 insertions, 129 deletions
diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py
index 12ce35e9..18fbb8a1 100644
--- a/wqflask/maintenance/convert_dryad_to_bimbam.py
+++ b/wqflask/maintenance/convert_dryad_to_bimbam.py
@@ -41,7 +41,7 @@ def read_dryad_file(filename):
return geno_rows
- #for i, marker in enumerate(marker_list):
+ # for i, marker in enumerate(marker_list):
# this_row = []
# this_row.append(marker)
# this_row.append("X")
@@ -52,18 +52,21 @@ def read_dryad_file(filename):
# this_row.append(line.split(" ")[i+2])
# print("row: " + str(i))
# geno_rows.append(this_row)
- #
- #return geno_rows
+ #
+ # return geno_rows
+
def write_bimbam_files(geno_rows):
with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh:
for row in geno_rows:
geno_fh.write(", ".join(row) + "\n")
+
def convert_dryad_to_bimbam(filename):
geno_file_rows = read_dryad_file(filename)
write_bimbam_files(geno_file_rows)
-if __name__=="__main__":
+
+if __name__ == "__main__":
input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
convert_dryad_to_bimbam(input_filename)
diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py
index d49742f2..078be529 100644
--- a/wqflask/maintenance/convert_geno_to_bimbam.py
+++ b/wqflask/maintenance/convert_geno_to_bimbam.py
@@ -20,9 +20,12 @@ import simplejson as json
from pprint import pformat as pf
-class EmptyConfigurations(Exception): pass
-class Marker(object):
+class EmptyConfigurations(Exception):
+ pass
+
+
+class Marker:
def __init__(self):
self.name = None
self.chr = None
@@ -30,7 +33,8 @@ class Marker(object):
self.Mb = None
self.genotypes = []
-class ConvertGenoFile(object):
+
+class ConvertGenoFile:
def __init__(self, input_file, output_files):
self.input_file = input_file
@@ -52,7 +56,7 @@ class ConvertGenoFile(object):
'@pat': "0",
'@het': "0.5",
'@unk': "NA"
- }
+ }
self.configurations = {}
self.input_fh = open(self.input_file)
@@ -80,13 +84,14 @@ class ConvertGenoFile(object):
genotypes = row_items[2:]
for item_count, genotype in enumerate(genotypes):
if genotype.upper().strip() in self.configurations:
- this_marker.genotypes.append(self.configurations[genotype.upper().strip()])
+ this_marker.genotypes.append(
+ self.configurations[genotype.upper().strip()])
else:
this_marker.genotypes.append("NA")
self.markers.append(this_marker.__dict__)
- self.write_to_bimbam()
+ self.write_to_bimbam()
def write_to_bimbam(self):
with open(self.output_files[0], "w") as geno_fh:
@@ -103,9 +108,11 @@ class ConvertGenoFile(object):
with open(self.output_files[2], "w") as snp_fh:
for marker in self.markers:
if self.mb_exists:
- snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n")
+ snp_fh.write(
+ marker['name'] + ", " + str(int(float(marker['Mb']) * 1000000)) + ", " + marker['chr'] + "\n")
else:
- snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n")
+ snp_fh.write(
+ marker['name'] + ", " + str(int(float(marker['cM']) * 1000000)) + ", " + marker['chr'] + "\n")
def get_sample_list(self, row_contents):
self.sample_list = []
@@ -119,7 +126,7 @@ class ConvertGenoFile(object):
self.sample_list = row_contents[3:]
else:
self.sample_list = row_contents[2:]
-
+
def process_rows(self):
for self.latest_row_pos, row in enumerate(self.input_fh):
self.latest_row_value = row
@@ -157,10 +164,14 @@ class ConvertGenoFile(object):
group_name = ".".join(input_file.split('.')[:-1])
if group_name == "HSNIH-Palmer":
continue
- geno_output_file = os.path.join(new_directory, group_name + "_geno.txt")
- pheno_output_file = os.path.join(new_directory, group_name + "_pheno.txt")
- snp_output_file = os.path.join(new_directory, group_name + "_snps.txt")
- output_files = [geno_output_file, pheno_output_file, snp_output_file]
+ geno_output_file = os.path.join(
+ new_directory, group_name + "_geno.txt")
+ pheno_output_file = os.path.join(
+ new_directory, group_name + "_pheno.txt")
+ snp_output_file = os.path.join(
+ new_directory, group_name + "_snps.txt")
+ output_files = [geno_output_file,
+ pheno_output_file, snp_output_file]
print("%s -> %s" % (
os.path.join(old_directory, input_file), geno_output_file))
convertob = ConvertGenoFile(input_file, output_files)
@@ -173,17 +184,18 @@ class ConvertGenoFile(object):
print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
- convertob.latest_col_pos))
+ convertob.latest_col_pos))
print(" Column is:", convertob.latest_col_value)
print(" Row is:", convertob.latest_row_value)
break
-if __name__=="__main__":
+
+if __name__ == "__main__":
Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/bimbam"""
#Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
#Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
#convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
- #convertob.convert()
+ # convertob.convert()
ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
- #ConvertGenoFiles(Geno_Directory)
+ # ConvertGenoFiles(Geno_Directory)
diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index 544e2fd1..db65a11f 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -55,16 +55,17 @@ from pprint import pformat as pf
#conn = Engine.connect()
+
def parse_db_uri():
"""Converts a database URI to the db name, host name, user name, and password"""
parsed_uri = urllib.parse.urlparse(SQL_URI)
db_conn_info = dict(
- db = parsed_uri.path[1:],
- host = parsed_uri.hostname,
- user = parsed_uri.username,
- passwd = parsed_uri.password)
+ db=parsed_uri.path[1:],
+ host=parsed_uri.hostname,
+ user=parsed_uri.username,
+ passwd=parsed_uri.password)
print(db_conn_info)
return db_conn_info
@@ -119,21 +120,23 @@ def get_types(groups):
else:
if not phenotypes_exist(group_name) and not genotypes_exist(group_name):
types[species].pop(group_name, None)
- groups[species] = tuple(group for group in groups[species] if group[0] != group_name)
- else: #ZS: This whole else statement might be unnecessary, need to check
+ groups[species] = tuple(
+ group for group in groups[species] if group[0] != group_name)
+ else: # ZS: This whole else statement might be unnecessary, need to check
types_list = build_types(species, group_name)
if len(types_list) > 0:
types[species][group_name] = types_list
else:
types[species].pop(group_name, None)
- groups[species] = tuple(group for group in groups[species] if group[0] != group_name)
+ groups[species] = tuple(
+ group for group in groups[species] if group[0] != group_name)
return types
def phenotypes_exist(group_name):
#print("group_name:", group_name)
Cursor.execute("""select Name from PublishFreeze
- where PublishFreeze.Name = '%s'""" % (group_name+"Publish"))
+ where PublishFreeze.Name = '%s'""" % (group_name + "Publish"))
results = Cursor.fetchone()
#print("RESULTS:", results)
@@ -143,10 +146,11 @@ def phenotypes_exist(group_name):
else:
return False
+
def genotypes_exist(group_name):
#print("group_name:", group_name)
Cursor.execute("""select Name from GenoFreeze
- where GenoFreeze.Name = '%s'""" % (group_name+"Geno"))
+ where GenoFreeze.Name = '%s'""" % (group_name + "Geno"))
results = Cursor.fetchone()
#print("RESULTS:", results)
@@ -156,6 +160,7 @@ def genotypes_exist(group_name):
else:
return False
+
def build_types(species, group):
"""Fetches tissues
@@ -184,6 +189,7 @@ def build_types(species, group):
return results
+
def get_datasets(types):
"""Build datasets list"""
datasets = {}
@@ -246,7 +252,7 @@ def build_datasets(species, group, type_name):
dataset_text = "%s Genotypes" % group
datasets.append((dataset_id, dataset_value, dataset_text))
- else: # for mRNA expression/ProbeSet
+ else: # for mRNA expression/ProbeSet
Cursor.execute("""select ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName from
ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue, Species where
Species.Name = '%s' and Species.Id = InbredSet.SpeciesId and
@@ -308,6 +314,7 @@ def _test_it():
datasets = build_datasets("Mouse", "BXD", "Hippocampus")
#print("build_datasets:", pf(datasets))
+
if __name__ == '__main__':
Conn = MySQLdb.Connect(**parse_db_uri())
Cursor = Conn.cursor()
diff --git a/wqflask/maintenance/generate_kinship_from_bimbam.py b/wqflask/maintenance/generate_kinship_from_bimbam.py
index 60257b28..9f01d094 100644
--- a/wqflask/maintenance/generate_kinship_from_bimbam.py
+++ b/wqflask/maintenance/generate_kinship_from_bimbam.py
@@ -13,14 +13,17 @@ sys.path.append("..")
import os
import glob
-class GenerateKinshipMatrices(object):
+
+class GenerateKinshipMatrices:
def __init__(self, group_name, geno_file, pheno_file):
self.group_name = group_name
self.geno_file = geno_file
self.pheno_file = pheno_file
-
+
def generate_kinship(self):
- gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + " -p " + self.pheno_file + " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name
+ gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + \
+ " -p " + self.pheno_file + \
+ " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name
print("command:", gemma_command)
os.system(gemma_command)
@@ -33,9 +36,12 @@ class GenerateKinshipMatrices(object):
group_name = ".".join(input_file.split('.')[:-1])
if group_name == "HSNIH-Palmer":
continue
- geno_input_file = os.path.join(bimbam_dir, group_name + "_geno.txt")
- pheno_input_file = os.path.join(bimbam_dir, group_name + "_pheno.txt")
- convertob = GenerateKinshipMatrices(group_name, geno_input_file, pheno_input_file)
+ geno_input_file = os.path.join(
+ bimbam_dir, group_name + "_geno.txt")
+ pheno_input_file = os.path.join(
+ bimbam_dir, group_name + "_pheno.txt")
+ convertob = GenerateKinshipMatrices(
+ group_name, geno_input_file, pheno_input_file)
try:
convertob.generate_kinship()
except EmptyConfigurations as why:
@@ -46,15 +52,15 @@ class GenerateKinshipMatrices(object):
print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
- convertob.latest_col_pos))
+ convertob.latest_col_pos))
print(" Column is:", convertob.latest_col_value)
print(" Row is:", convertob.latest_row_value)
break
-
-
-if __name__=="__main__":
+
+
+if __name__ == "__main__":
Geno_Directory = """/export/local/home/zas1024/genotype_files/genotype/"""
Bimbam_Directory = """/export/local/home/zas1024/genotype_files/genotype/bimbam/"""
GenerateKinshipMatrices.process_all(Geno_Directory, Bimbam_Directory)
-
- #./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD
+
+ # ./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD
diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py
index b1e41e9a..e964c8ed 100644
--- a/wqflask/maintenance/generate_probesetfreeze_file.py
+++ b/wqflask/maintenance/generate_probesetfreeze_file.py
@@ -23,10 +23,12 @@ def get_cursor():
cursor = con.cursor()
return cursor
+
def show_progress(process, counter):
if counter % 1000 == 0:
print("{}: {}".format(process, counter))
+
def get_strains(cursor):
cursor.execute("""select Strain.Name
from Strain, StrainXRef, InbredSet
@@ -42,6 +44,7 @@ def get_strains(cursor):
return strains
+
def get_probeset_vals(cursor, dataset_name):
cursor.execute(""" select ProbeSet.Id, ProbeSet.Name
from ProbeSetXRef,
@@ -77,6 +80,7 @@ def get_probeset_vals(cursor, dataset_name):
return probeset_vals
+
def trim_strains(strains, probeset_vals):
trimmed_strains = []
#print("probeset_vals is:", pf(probeset_vals))
@@ -89,6 +93,7 @@ def trim_strains(strains, probeset_vals):
print("trimmed_strains:", pf(trimmed_strains))
return trimmed_strains
+
def write_data_matrix_file(strains, probeset_vals, filename):
with open(filename, "wb") as fh:
csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL)
@@ -103,10 +108,12 @@ def write_data_matrix_file(strains, probeset_vals, filename):
csv_writer.writerow(row_data)
show_progress("Writing", counter)
+
def main():
- filename = os.path.expanduser("~/gene/wqflask/maintenance/" +
- "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2" +
- "(Oct08)_RankInv_Beta.txt")
+ filename = os.path.expanduser(
+ "~/gene/wqflask/maintenance/"
+ "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2"
+ "(Oct08)_RankInv_Beta.txt")
dataset_name = "Eye_AXBXA_1008_RankInv"
cursor = get_cursor()
@@ -117,5 +124,6 @@ def main():
trimmed_strains = trim_strains(strains, probeset_vals)
write_data_matrix_file(trimmed_strains, probeset_vals, filename)
+
if __name__ == '__main__':
main()
diff --git a/wqflask/maintenance/geno_to_json.py b/wqflask/maintenance/geno_to_json.py
index 7e7fd241..32e0e34b 100644
--- a/wqflask/maintenance/geno_to_json.py
+++ b/wqflask/maintenance/geno_to_json.py
@@ -25,11 +25,12 @@ from pprint import pformat as pf
#from utility.tools import flat_files
-class EmptyConfigurations(Exception): pass
-
+class EmptyConfigurations(Exception):
+ pass
-class Marker(object):
+
+class Marker:
def __init__(self):
self.name = None
self.chr = None
@@ -37,23 +38,24 @@ class Marker(object):
self.Mb = None
self.genotypes = []
-class ConvertGenoFile(object):
+
+class ConvertGenoFile:
def __init__(self, input_file, output_file):
-
+
self.input_file = input_file
self.output_file = output_file
-
+
self.mb_exists = False
self.cm_exists = False
self.markers = []
-
+
self.latest_row_pos = None
self.latest_col_pos = None
-
+
self.latest_row_value = None
self.latest_col_value = None
-
+
def convert(self):
self.haplotype_notation = {
@@ -61,24 +63,23 @@ class ConvertGenoFile(object):
'@pat': "0",
'@het': "0.5",
'@unk': "NA"
- }
-
+ }
+
self.configurations = {}
#self.skipped_cols = 3
-
- #if self.input_file.endswith(".geno.gz"):
+
+ # if self.input_file.endswith(".geno.gz"):
# print("self.input_file: ", self.input_file)
# self.input_fh = gzip.open(self.input_file)
- #else:
+ # else:
self.input_fh = open(self.input_file)
-
+
with open(self.output_file, "w") as self.output_fh:
- #if self.file_type == "geno":
+ # if self.file_type == "geno":
self.process_csv()
- #elif self.file_type == "snps":
+ # elif self.file_type == "snps":
# self.process_snps_file()
-
def process_csv(self):
for row_count, row in enumerate(self.process_rows()):
row_items = row.split("\t")
@@ -100,31 +101,31 @@ class ConvertGenoFile(object):
genotypes = row_items[2:]
for item_count, genotype in enumerate(genotypes):
if genotype.upper() in self.configurations:
- this_marker.genotypes.append(self.configurations[genotype.upper()])
+ this_marker.genotypes.append(
+ self.configurations[genotype.upper()])
else:
this_marker.genotypes.append("NA")
-
- #print("this_marker is:", pf(this_marker.__dict__))
- #if this_marker.chr == "14":
+
+ #print("this_marker is:", pf(this_marker.__dict__))
+ # if this_marker.chr == "14":
self.markers.append(this_marker.__dict__)
with open(self.output_file, 'w') as fh:
json.dump(self.markers, fh, indent=" ", sort_keys=True)
-
- # print('configurations:', str(configurations))
- #self.latest_col_pos = item_count + self.skipped_cols
- #self.latest_col_value = item
-
- #if item_count != 0:
- # self.output_fh.write(" ")
- #self.output_fh.write(self.configurations[item.upper()])
-
- #self.output_fh.write("\n")
+ # print('configurations:', str(configurations))
+ #self.latest_col_pos = item_count + self.skipped_cols
+ #self.latest_col_value = item
+
+ # if item_count != 0:
+ # self.output_fh.write(" ")
+ # self.output_fh.write(self.configurations[item.upper()])
+
+ # self.output_fh.write("\n")
def process_rows(self):
for self.latest_row_pos, row in enumerate(self.input_fh):
- #if self.input_file.endswith(".geno.gz"):
+ # if self.input_file.endswith(".geno.gz"):
# print("row: ", row)
self.latest_row_value = row
# Take care of headers
@@ -171,26 +172,25 @@ class ConvertGenoFile(object):
print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
- convertob.latest_col_pos))
+ convertob.latest_col_pos))
print(" Column is:", convertob.latest_col_value)
print(" Row is:", convertob.latest_row_value)
break
-
- #def process_snps_file(cls, snps_file, new_directory):
+
+ # def process_snps_file(cls, snps_file, new_directory):
# output_file = os.path.join(new_directory, "mouse_families.json")
# print("%s -> %s" % (snps_file, output_file))
# convertob = ConvertGenoFile(input_file, output_file)
-
-if __name__=="__main__":
+if __name__ == "__main__":
Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/json"""
#Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
#Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
#convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
- #convertob.convert()
+ # convertob.convert()
ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
- #ConvertGenoFiles(Geno_Directory)
-
+ # ConvertGenoFiles(Geno_Directory)
+
#process_csv(Input_File, Output_File)
diff --git a/wqflask/maintenance/get_group_samplelists.py b/wqflask/maintenance/get_group_samplelists.py
index 3f9d0278..0a450d3f 100644
--- a/wqflask/maintenance/get_group_samplelists.py
+++ b/wqflask/maintenance/get_group_samplelists.py
@@ -4,12 +4,14 @@ import gzip
from base import webqtlConfig
+
def get_samplelist(file_type, geno_file):
if file_type == "geno":
return get_samplelist_from_geno(geno_file)
elif file_type == "plink":
return get_samplelist_from_plink(geno_file)
+
def get_samplelist_from_geno(genofilename):
if os.path.isfile(genofilename + '.gz'):
genofilename += '.gz'
@@ -33,6 +35,7 @@ def get_samplelist_from_geno(genofilename):
samplelist = headers[3:]
return samplelist
+
def get_samplelist_from_plink(genofilename):
genofile = open(genofilename)
diff --git a/wqflask/maintenance/print_benchmark.py b/wqflask/maintenance/print_benchmark.py
index b24ce4f1..9d12da8a 100644
--- a/wqflask/maintenance/print_benchmark.py
+++ b/wqflask/maintenance/print_benchmark.py
@@ -5,7 +5,7 @@ import time
from pprint import pformat as pf
-class TheCounter(object):
+class TheCounter:
Counters = {}
def __init__(self):
@@ -15,15 +15,18 @@ class TheCounter(object):
self.time_took = time.time() - start_time
TheCounter.Counters[self.__class__.__name__] = self.time_took
+
class PrintAll(TheCounter):
def print_it(self, counter):
print(counter)
+
class PrintSome(TheCounter):
def print_it(self, counter):
if counter % 1000 == 0:
print(counter)
+
class PrintNone(TheCounter):
def print_it(self, counter):
pass
@@ -37,5 +40,6 @@ def new_main():
print(pf(TheCounter.Counters))
+
if __name__ == '__main__':
new_main()
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
index 701b2b50..0cc963e5 100644
--- a/wqflask/maintenance/quantile_normalize.py
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -14,42 +14,48 @@ from wqflask import app
from utility.elasticsearch_tools import get_elasticsearch_connection
from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI
+
def parse_db_uri():
"""Converts a database URI to the db name, host name, user name, and password"""
parsed_uri = urllib.parse.urlparse(SQL_URI)
db_conn_info = dict(
- db = parsed_uri.path[1:],
- host = parsed_uri.hostname,
- user = parsed_uri.username,
- passwd = parsed_uri.password)
+ db=parsed_uri.path[1:],
+ host=parsed_uri.hostname,
+ user=parsed_uri.username,
+ passwd=parsed_uri.password)
print(db_conn_info)
return db_conn_info
+
def create_dataframe(input_file):
with open(input_file) as f:
ncols = len(f.readline().split("\t"))
- input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t", skiprows=1, usecols=list(range(1, ncols)))
+ input_array = np.loadtxt(open(
+ input_file, "rb"), delimiter="\t", skiprows=1, usecols=list(range(1, ncols)))
return pd.DataFrame(input_array)
-#This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+# This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+
+
def quantileNormalize(df_input):
df = df_input.copy()
- #compute rank
+ # compute rank
dic = {}
for col in df:
- dic.update({col : sorted(df[col])})
+ dic.update({col: sorted(df[col])})
sorted_df = pd.DataFrame(dic)
- rank = sorted_df.mean(axis = 1).tolist()
- #sort
+ rank = sorted_df.mean(axis=1).tolist()
+ # sort
for col in df:
t = np.searchsorted(np.sort(df[col]), df[col])
df[col] = [rank[i] for i in t]
return df
+
def set_data(dataset_name):
orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
@@ -64,10 +70,10 @@ def set_data(dataset_name):
trait_name = line1.split('\t')[0]
for i, sample in enumerate(sample_names):
this_sample = {
- "name": sample,
- "value": line1.split('\t')[i+1],
- "qnorm": line2.split('\t')[i+1]
- }
+ "name": sample,
+ "value": line1.split('\t')[i + 1],
+ "qnorm": line2.split('\t')[i + 1]
+ }
sample_list.append(this_sample)
query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName
FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet
@@ -95,13 +101,14 @@ def set_data(dataset_name):
}
}
+
if __name__ == '__main__':
Conn = MySQLdb.Connect(**parse_db_uri())
Cursor = Conn.cursor()
- #es = Elasticsearch([{
+ # es = Elasticsearch([{
# "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT
- #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None
+ # }], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None
es = get_elasticsearch_connection(for_user=False)
@@ -116,8 +123,8 @@ if __name__ == '__main__':
success, _ = bulk(es, set_data(sys.argv[1]))
response = es.search(
- index = "traits", doc_type = "trait", body = {
- "query": { "match": { "name": "ENSMUSG00000028982" } }
+ index="traits", doc_type="trait", body={
+ "query": {"match": {"name": "ENSMUSG00000028982"}}
}
)
diff --git a/wqflask/maintenance/set_resource_defaults.py b/wqflask/maintenance/set_resource_defaults.py
index 4177c124..0f472494 100644
--- a/wqflask/maintenance/set_resource_defaults.py
+++ b/wqflask/maintenance/set_resource_defaults.py
@@ -37,20 +37,22 @@ import urllib.parse
from utility.logger import getLogger
logger = getLogger(__name__)
+
def parse_db_uri():
"""Converts a database URI to the db name, host name, user name, and password"""
parsed_uri = urllib.parse.urlparse(SQL_URI)
db_conn_info = dict(
- db = parsed_uri.path[1:],
- host = parsed_uri.hostname,
- user = parsed_uri.username,
- passwd = parsed_uri.password)
+ db=parsed_uri.path[1:],
+ host=parsed_uri.hostname,
+ user=parsed_uri.username,
+ passwd=parsed_uri.password)
print(db_conn_info)
return db_conn_info
+
def insert_probeset_resources(default_owner_id):
current_resources = Redis.hgetall("resources")
Cursor.execute(""" SELECT
@@ -63,20 +65,21 @@ def insert_probeset_resources(default_owner_id):
resource_ob = {}
resource_ob['name'] = resource[1]
resource_ob['owner_id'] = default_owner_id
- resource_ob['data'] = { "dataset" : str(resource[0])}
+ resource_ob['data'] = {"dataset": str(resource[0])}
resource_ob['type'] = "dataset-probeset"
if resource[2] < 1 and resource[3] > 0:
- resource_ob['default_mask'] = { "data": "view",
- "metadata": "view",
- "admin": "not-admin"}
+ resource_ob['default_mask'] = {"data": "view",
+ "metadata": "view",
+ "admin": "not-admin"}
else:
- resource_ob['default_mask'] = { "data": "no-access",
- "metadata": "no-access",
- "admin": "not-admin"}
+ resource_ob['default_mask'] = {"data": "no-access",
+ "metadata": "no-access",
+ "admin": "not-admin"}
resource_ob['group_masks'] = {}
add_resource(resource_ob, update=False)
+
def insert_publish_resources(default_owner_id):
current_resources = Redis.hgetall("resources")
Cursor.execute(""" SELECT
@@ -97,12 +100,12 @@ def insert_publish_resources(default_owner_id):
else:
resource_ob['name'] = str(resource[0])
resource_ob['owner_id'] = default_owner_id
- resource_ob['data'] = { "dataset" : str(resource[1]) ,
- "trait" : str(resource[0])}
+ resource_ob['data'] = {"dataset": str(resource[1]),
+ "trait": str(resource[0])}
resource_ob['type'] = "dataset-publish"
- resource_ob['default_mask'] = { "data": "view",
- "metadata": "view",
- "admin": "not-admin"}
+ resource_ob['default_mask'] = {"data": "view",
+ "metadata": "view",
+ "admin": "not-admin"}
resource_ob['group_masks'] = {}
@@ -110,6 +113,7 @@ def insert_publish_resources(default_owner_id):
else:
continue
+
def insert_geno_resources(default_owner_id):
current_resources = Redis.hgetall("resources")
Cursor.execute(""" SELECT
@@ -125,20 +129,21 @@ def insert_geno_resources(default_owner_id):
resource_ob['owner_id'] = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae"
else:
resource_ob['owner_id'] = default_owner_id
- resource_ob['data'] = { "dataset" : str(resource[0]) }
+ resource_ob['data'] = {"dataset": str(resource[0])}
resource_ob['type'] = "dataset-geno"
if resource[2] < 1:
- resource_ob['default_mask'] = { "data": "view",
- "metadata": "view",
- "admin": "not-admin"}
+ resource_ob['default_mask'] = {"data": "view",
+ "metadata": "view",
+ "admin": "not-admin"}
else:
- resource_ob['default_mask'] = { "data": "no-access",
- "metadata": "no-access",
- "admin": "not-admin"}
+ resource_ob['default_mask'] = {"data": "no-access",
+ "metadata": "no-access",
+ "admin": "not-admin"}
resource_ob['group_masks'] = {}
add_resource(resource_ob, update=False)
+
def insert_resources(default_owner_id):
current_resources = get_resources()
print("START")
@@ -149,6 +154,7 @@ def insert_resources(default_owner_id):
insert_probeset_resources(default_owner_id)
print("AFTER PROBESET")
+
def main():
"""Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
@@ -158,6 +164,7 @@ def main():
insert_resources(owner_id)
+
if __name__ == '__main__':
Conn = MySQLdb.Connect(**parse_db_uri())
Cursor = Conn.cursor()