aboutsummaryrefslogtreecommitdiff
path: root/wqflask/maintenance
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r--wqflask/maintenance/convert_dryad_to_bimbam.py14
-rw-r--r--wqflask/maintenance/convert_geno_to_bimbam.py47
-rw-r--r--wqflask/maintenance/gen_select_dataset.py49
-rw-r--r--wqflask/maintenance/generate_kinship_from_bimbam.py31
-rw-r--r--wqflask/maintenance/generate_probesetfreeze_file.py18
-rw-r--r--wqflask/maintenance/geno_to_json.py85
-rw-r--r--wqflask/maintenance/get_group_samplelists.py5
-rw-r--r--wqflask/maintenance/print_benchmark.py10
-rw-r--r--wqflask/maintenance/quantile_normalize.py58
-rw-r--r--wqflask/maintenance/set_resource_defaults.py171
10 files changed, 343 insertions, 145 deletions
diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py
index e833b395..18fbb8a1 100644
--- a/wqflask/maintenance/convert_dryad_to_bimbam.py
+++ b/wqflask/maintenance/convert_dryad_to_bimbam.py
@@ -6,7 +6,6 @@ Convert data dryad files to a BIMBAM _geno and _snps file
"""
-from __future__ import print_function, division, absolute_import
import sys
sys.path.append("..")
@@ -42,7 +41,7 @@ def read_dryad_file(filename):
return geno_rows
- #for i, marker in enumerate(marker_list):
+ # for i, marker in enumerate(marker_list):
# this_row = []
# this_row.append(marker)
# this_row.append("X")
@@ -53,18 +52,21 @@ def read_dryad_file(filename):
# this_row.append(line.split(" ")[i+2])
# print("row: " + str(i))
# geno_rows.append(this_row)
- #
- #return geno_rows
+ #
+ # return geno_rows
+
def write_bimbam_files(geno_rows):
with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh:
for row in geno_rows:
geno_fh.write(", ".join(row) + "\n")
+
def convert_dryad_to_bimbam(filename):
geno_file_rows = read_dryad_file(filename)
write_bimbam_files(geno_file_rows)
-if __name__=="__main__":
+
+if __name__ == "__main__":
input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
- convert_dryad_to_bimbam(input_filename) \ No newline at end of file
+ convert_dryad_to_bimbam(input_filename)
diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py
index 528b98cf..078be529 100644
--- a/wqflask/maintenance/convert_geno_to_bimbam.py
+++ b/wqflask/maintenance/convert_geno_to_bimbam.py
@@ -9,7 +9,6 @@ code
"""
-from __future__ import print_function, division, absolute_import
import sys
sys.path.append("..")
import os
@@ -21,9 +20,12 @@ import simplejson as json
from pprint import pformat as pf
-class EmptyConfigurations(Exception): pass
-class Marker(object):
+class EmptyConfigurations(Exception):
+ pass
+
+
+class Marker:
def __init__(self):
self.name = None
self.chr = None
@@ -31,7 +33,8 @@ class Marker(object):
self.Mb = None
self.genotypes = []
-class ConvertGenoFile(object):
+
+class ConvertGenoFile:
def __init__(self, input_file, output_files):
self.input_file = input_file
@@ -53,7 +56,7 @@ class ConvertGenoFile(object):
'@pat': "0",
'@het': "0.5",
'@unk': "NA"
- }
+ }
self.configurations = {}
self.input_fh = open(self.input_file)
@@ -81,13 +84,14 @@ class ConvertGenoFile(object):
genotypes = row_items[2:]
for item_count, genotype in enumerate(genotypes):
if genotype.upper().strip() in self.configurations:
- this_marker.genotypes.append(self.configurations[genotype.upper().strip()])
+ this_marker.genotypes.append(
+ self.configurations[genotype.upper().strip()])
else:
this_marker.genotypes.append("NA")
self.markers.append(this_marker.__dict__)
- self.write_to_bimbam()
+ self.write_to_bimbam()
def write_to_bimbam(self):
with open(self.output_files[0], "w") as geno_fh:
@@ -104,9 +108,11 @@ class ConvertGenoFile(object):
with open(self.output_files[2], "w") as snp_fh:
for marker in self.markers:
if self.mb_exists:
- snp_fh.write(marker['name'] +", " + str(int(float(marker['Mb'])*1000000)) + ", " + marker['chr'] + "\n")
+ snp_fh.write(
+ marker['name'] + ", " + str(int(float(marker['Mb']) * 1000000)) + ", " + marker['chr'] + "\n")
else:
- snp_fh.write(marker['name'] +", " + str(int(float(marker['cM'])*1000000)) + ", " + marker['chr'] + "\n")
+ snp_fh.write(
+ marker['name'] + ", " + str(int(float(marker['cM']) * 1000000)) + ", " + marker['chr'] + "\n")
def get_sample_list(self, row_contents):
self.sample_list = []
@@ -120,7 +126,7 @@ class ConvertGenoFile(object):
self.sample_list = row_contents[3:]
else:
self.sample_list = row_contents[2:]
-
+
def process_rows(self):
for self.latest_row_pos, row in enumerate(self.input_fh):
self.latest_row_value = row
@@ -158,10 +164,14 @@ class ConvertGenoFile(object):
group_name = ".".join(input_file.split('.')[:-1])
if group_name == "HSNIH-Palmer":
continue
- geno_output_file = os.path.join(new_directory, group_name + "_geno.txt")
- pheno_output_file = os.path.join(new_directory, group_name + "_pheno.txt")
- snp_output_file = os.path.join(new_directory, group_name + "_snps.txt")
- output_files = [geno_output_file, pheno_output_file, snp_output_file]
+ geno_output_file = os.path.join(
+ new_directory, group_name + "_geno.txt")
+ pheno_output_file = os.path.join(
+ new_directory, group_name + "_pheno.txt")
+ snp_output_file = os.path.join(
+ new_directory, group_name + "_snps.txt")
+ output_files = [geno_output_file,
+ pheno_output_file, snp_output_file]
print("%s -> %s" % (
os.path.join(old_directory, input_file), geno_output_file))
convertob = ConvertGenoFile(input_file, output_files)
@@ -174,17 +184,18 @@ class ConvertGenoFile(object):
print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
- convertob.latest_col_pos))
+ convertob.latest_col_pos))
print(" Column is:", convertob.latest_col_value)
print(" Row is:", convertob.latest_row_value)
break
-if __name__=="__main__":
+
+if __name__ == "__main__":
Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/bimbam"""
#Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
#Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
#convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
- #convertob.convert()
+ # convertob.convert()
ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
- #ConvertGenoFiles(Geno_Directory) \ No newline at end of file
+ # ConvertGenoFiles(Geno_Directory)
diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index 647e58a2..db65a11f 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -30,18 +30,10 @@ It needs to be run manually when database has been changed. Run it as
#
# This module is used by GeneNetwork project (www.genenetwork.org)
-from __future__ import print_function, division
-
-#from flask import config
-#
-#cdict = {}
-#config = config.Config(cdict).from_envvar('WQFLASK_SETTINGS')
-#print("cdict is:", cdict)
-
import sys
# NEW: Note we prepend the current path - otherwise a guix instance of GN2 may be used instead
-sys.path.insert(0,'./')
+sys.path.insert(0, './')
# NEW: import app to avoid a circular dependency on utility.tools
from wqflask import app
@@ -50,7 +42,7 @@ from utility.tools import locate, locate_ignore_error, TEMPDIR, SQL_URI
import MySQLdb
import simplejson as json
-import urlparse
+import urllib.parse
#import sqlalchemy as sa
@@ -63,16 +55,17 @@ from pprint import pformat as pf
#conn = Engine.connect()
+
def parse_db_uri():
"""Converts a database URI to the db name, host name, user name, and password"""
- parsed_uri = urlparse.urlparse(SQL_URI)
+ parsed_uri = urllib.parse.urlparse(SQL_URI)
db_conn_info = dict(
- db = parsed_uri.path[1:],
- host = parsed_uri.hostname,
- user = parsed_uri.username,
- passwd = parsed_uri.password)
+ db=parsed_uri.path[1:],
+ host=parsed_uri.hostname,
+ user=parsed_uri.username,
+ passwd=parsed_uri.password)
print(db_conn_info)
return db_conn_info
@@ -108,7 +101,7 @@ def get_types(groups):
"""Build types list"""
types = {}
#print("Groups: ", pf(groups))
- for species, group_dict in groups.iteritems():
+ for species, group_dict in list(groups.items()):
types[species] = {}
for group_name, _group_full_name in group_dict:
# make group an alias to shorten the code
@@ -127,21 +120,23 @@ def get_types(groups):
else:
if not phenotypes_exist(group_name) and not genotypes_exist(group_name):
types[species].pop(group_name, None)
- groups[species] = tuple(group for group in groups[species] if group[0] != group_name)
- else: #ZS: This whole else statement might be unnecessary, need to check
+ groups[species] = tuple(
+ group for group in groups[species] if group[0] != group_name)
+ else: # ZS: This whole else statement might be unnecessary, need to check
types_list = build_types(species, group_name)
if len(types_list) > 0:
types[species][group_name] = types_list
else:
types[species].pop(group_name, None)
- groups[species] = tuple(group for group in groups[species] if group[0] != group_name)
+ groups[species] = tuple(
+ group for group in groups[species] if group[0] != group_name)
return types
def phenotypes_exist(group_name):
#print("group_name:", group_name)
Cursor.execute("""select Name from PublishFreeze
- where PublishFreeze.Name = '%s'""" % (group_name+"Publish"))
+ where PublishFreeze.Name = '%s'""" % (group_name + "Publish"))
results = Cursor.fetchone()
#print("RESULTS:", results)
@@ -151,10 +146,11 @@ def phenotypes_exist(group_name):
else:
return False
+
def genotypes_exist(group_name):
#print("group_name:", group_name)
Cursor.execute("""select Name from GenoFreeze
- where GenoFreeze.Name = '%s'""" % (group_name+"Geno"))
+ where GenoFreeze.Name = '%s'""" % (group_name + "Geno"))
results = Cursor.fetchone()
#print("RESULTS:", results)
@@ -164,6 +160,7 @@ def genotypes_exist(group_name):
else:
return False
+
def build_types(species, group):
"""Fetches tissues
@@ -192,12 +189,13 @@ def build_types(species, group):
return results
+
def get_datasets(types):
"""Build datasets list"""
datasets = {}
- for species, group_dict in types.iteritems():
+ for species, group_dict in list(types.items()):
datasets[species] = {}
- for group, type_list in group_dict.iteritems():
+ for group, type_list in list(group_dict.items()):
datasets[species][group] = {}
for type_name in type_list:
these_datasets = build_datasets(species, group, type_name[0])
@@ -254,7 +252,7 @@ def build_datasets(species, group, type_name):
dataset_text = "%s Genotypes" % group
datasets.append((dataset_id, dataset_value, dataset_text))
- else: # for mRNA expression/ProbeSet
+ else: # for mRNA expression/ProbeSet
Cursor.execute("""select ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName from
ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue, Species where
Species.Name = '%s' and Species.Id = InbredSet.SpeciesId and
@@ -316,7 +314,8 @@ def _test_it():
datasets = build_datasets("Mouse", "BXD", "Hippocampus")
#print("build_datasets:", pf(datasets))
+
if __name__ == '__main__':
Conn = MySQLdb.Connect(**parse_db_uri())
Cursor = Conn.cursor()
- main() \ No newline at end of file
+ main()
diff --git a/wqflask/maintenance/generate_kinship_from_bimbam.py b/wqflask/maintenance/generate_kinship_from_bimbam.py
index b53f5dda..9f01d094 100644
--- a/wqflask/maintenance/generate_kinship_from_bimbam.py
+++ b/wqflask/maintenance/generate_kinship_from_bimbam.py
@@ -8,20 +8,22 @@ and uses GEMMA to generate their corresponding kinship/relatedness matrix file
"""
-from __future__ import print_function, division, absolute_import
import sys
sys.path.append("..")
import os
import glob
-class GenerateKinshipMatrices(object):
+
+class GenerateKinshipMatrices:
def __init__(self, group_name, geno_file, pheno_file):
self.group_name = group_name
self.geno_file = geno_file
self.pheno_file = pheno_file
-
+
def generate_kinship(self):
- gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + " -p " + self.pheno_file + " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name
+ gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + \
+ " -p " + self.pheno_file + \
+ " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name
print("command:", gemma_command)
os.system(gemma_command)
@@ -34,9 +36,12 @@ class GenerateKinshipMatrices(object):
group_name = ".".join(input_file.split('.')[:-1])
if group_name == "HSNIH-Palmer":
continue
- geno_input_file = os.path.join(bimbam_dir, group_name + "_geno.txt")
- pheno_input_file = os.path.join(bimbam_dir, group_name + "_pheno.txt")
- convertob = GenerateKinshipMatrices(group_name, geno_input_file, pheno_input_file)
+ geno_input_file = os.path.join(
+ bimbam_dir, group_name + "_geno.txt")
+ pheno_input_file = os.path.join(
+ bimbam_dir, group_name + "_pheno.txt")
+ convertob = GenerateKinshipMatrices(
+ group_name, geno_input_file, pheno_input_file)
try:
convertob.generate_kinship()
except EmptyConfigurations as why:
@@ -47,15 +52,15 @@ class GenerateKinshipMatrices(object):
print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
- convertob.latest_col_pos))
+ convertob.latest_col_pos))
print(" Column is:", convertob.latest_col_value)
print(" Row is:", convertob.latest_row_value)
break
-
-
-if __name__=="__main__":
+
+
+if __name__ == "__main__":
Geno_Directory = """/export/local/home/zas1024/genotype_files/genotype/"""
Bimbam_Directory = """/export/local/home/zas1024/genotype_files/genotype/bimbam/"""
GenerateKinshipMatrices.process_all(Geno_Directory, Bimbam_Directory)
-
- #./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD \ No newline at end of file
+
+ # ./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD
diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py
index b7b2dc8e..e964c8ed 100644
--- a/wqflask/maintenance/generate_probesetfreeze_file.py
+++ b/wqflask/maintenance/generate_probesetfreeze_file.py
@@ -1,7 +1,5 @@
#!/usr/bin/python
-from __future__ import absolute_import, print_function, division
-
import sys
# sys.path.insert(0, "..") - why?
@@ -25,10 +23,12 @@ def get_cursor():
cursor = con.cursor()
return cursor
+
def show_progress(process, counter):
if counter % 1000 == 0:
print("{}: {}".format(process, counter))
+
def get_strains(cursor):
cursor.execute("""select Strain.Name
from Strain, StrainXRef, InbredSet
@@ -44,6 +44,7 @@ def get_strains(cursor):
return strains
+
def get_probeset_vals(cursor, dataset_name):
cursor.execute(""" select ProbeSet.Id, ProbeSet.Name
from ProbeSetXRef,
@@ -79,10 +80,11 @@ def get_probeset_vals(cursor, dataset_name):
return probeset_vals
+
def trim_strains(strains, probeset_vals):
trimmed_strains = []
#print("probeset_vals is:", pf(probeset_vals))
- first_probeset = list(probeset_vals.itervalues())[0]
+ first_probeset = list(probeset_vals.values())[0]
print("\n**** first_probeset is:", pf(first_probeset))
for strain in strains:
print("\n**** strain is:", pf(strain))
@@ -91,6 +93,7 @@ def trim_strains(strains, probeset_vals):
print("trimmed_strains:", pf(trimmed_strains))
return trimmed_strains
+
def write_data_matrix_file(strains, probeset_vals, filename):
with open(filename, "wb") as fh:
csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL)
@@ -105,10 +108,12 @@ def write_data_matrix_file(strains, probeset_vals, filename):
csv_writer.writerow(row_data)
show_progress("Writing", counter)
+
def main():
- filename = os.path.expanduser("~/gene/wqflask/maintenance/" +
- "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2" +
- "(Oct08)_RankInv_Beta.txt")
+ filename = os.path.expanduser(
+ "~/gene/wqflask/maintenance/"
+ "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2"
+ "(Oct08)_RankInv_Beta.txt")
dataset_name = "Eye_AXBXA_1008_RankInv"
cursor = get_cursor()
@@ -119,5 +124,6 @@ def main():
trimmed_strains = trim_strains(strains, probeset_vals)
write_data_matrix_file(trimmed_strains, probeset_vals, filename)
+
if __name__ == '__main__':
main()
diff --git a/wqflask/maintenance/geno_to_json.py b/wqflask/maintenance/geno_to_json.py
index 9579812a..32e0e34b 100644
--- a/wqflask/maintenance/geno_to_json.py
+++ b/wqflask/maintenance/geno_to_json.py
@@ -9,7 +9,6 @@ code
"""
-from __future__ import print_function, division, absolute_import
import sys
sys.path.append("..")
import os
@@ -26,11 +25,12 @@ from pprint import pformat as pf
#from utility.tools import flat_files
-class EmptyConfigurations(Exception): pass
-
+class EmptyConfigurations(Exception):
+ pass
-class Marker(object):
+
+class Marker:
def __init__(self):
self.name = None
self.chr = None
@@ -38,23 +38,24 @@ class Marker(object):
self.Mb = None
self.genotypes = []
-class ConvertGenoFile(object):
+
+class ConvertGenoFile:
def __init__(self, input_file, output_file):
-
+
self.input_file = input_file
self.output_file = output_file
-
+
self.mb_exists = False
self.cm_exists = False
self.markers = []
-
+
self.latest_row_pos = None
self.latest_col_pos = None
-
+
self.latest_row_value = None
self.latest_col_value = None
-
+
def convert(self):
self.haplotype_notation = {
@@ -62,24 +63,23 @@ class ConvertGenoFile(object):
'@pat': "0",
'@het': "0.5",
'@unk': "NA"
- }
-
+ }
+
self.configurations = {}
#self.skipped_cols = 3
-
- #if self.input_file.endswith(".geno.gz"):
+
+ # if self.input_file.endswith(".geno.gz"):
# print("self.input_file: ", self.input_file)
# self.input_fh = gzip.open(self.input_file)
- #else:
+ # else:
self.input_fh = open(self.input_file)
-
+
with open(self.output_file, "w") as self.output_fh:
- #if self.file_type == "geno":
+ # if self.file_type == "geno":
self.process_csv()
- #elif self.file_type == "snps":
+ # elif self.file_type == "snps":
# self.process_snps_file()
-
def process_csv(self):
for row_count, row in enumerate(self.process_rows()):
row_items = row.split("\t")
@@ -101,31 +101,31 @@ class ConvertGenoFile(object):
genotypes = row_items[2:]
for item_count, genotype in enumerate(genotypes):
if genotype.upper() in self.configurations:
- this_marker.genotypes.append(self.configurations[genotype.upper()])
+ this_marker.genotypes.append(
+ self.configurations[genotype.upper()])
else:
this_marker.genotypes.append("NA")
-
- #print("this_marker is:", pf(this_marker.__dict__))
- #if this_marker.chr == "14":
+
+ #print("this_marker is:", pf(this_marker.__dict__))
+ # if this_marker.chr == "14":
self.markers.append(this_marker.__dict__)
with open(self.output_file, 'w') as fh:
json.dump(self.markers, fh, indent=" ", sort_keys=True)
-
- # print('configurations:', str(configurations))
- #self.latest_col_pos = item_count + self.skipped_cols
- #self.latest_col_value = item
-
- #if item_count != 0:
- # self.output_fh.write(" ")
- #self.output_fh.write(self.configurations[item.upper()])
-
- #self.output_fh.write("\n")
+ # print('configurations:', str(configurations))
+ #self.latest_col_pos = item_count + self.skipped_cols
+ #self.latest_col_value = item
+
+ # if item_count != 0:
+ # self.output_fh.write(" ")
+ # self.output_fh.write(self.configurations[item.upper()])
+
+ # self.output_fh.write("\n")
def process_rows(self):
for self.latest_row_pos, row in enumerate(self.input_fh):
- #if self.input_file.endswith(".geno.gz"):
+ # if self.input_file.endswith(".geno.gz"):
# print("row: ", row)
self.latest_row_value = row
# Take care of headers
@@ -172,26 +172,25 @@ class ConvertGenoFile(object):
print(" Exception:", why)
print(traceback.print_exc())
print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
- convertob.latest_col_pos))
+ convertob.latest_col_pos))
print(" Column is:", convertob.latest_col_value)
print(" Row is:", convertob.latest_row_value)
break
-
- #def process_snps_file(cls, snps_file, new_directory):
+
+ # def process_snps_file(cls, snps_file, new_directory):
# output_file = os.path.join(new_directory, "mouse_families.json")
# print("%s -> %s" % (snps_file, output_file))
# convertob = ConvertGenoFile(input_file, output_file)
-
-if __name__=="__main__":
+if __name__ == "__main__":
Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/json"""
#Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
#Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
#convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
- #convertob.convert()
+ # convertob.convert()
ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
- #ConvertGenoFiles(Geno_Directory)
-
- #process_csv(Input_File, Output_File) \ No newline at end of file
+ # ConvertGenoFiles(Geno_Directory)
+
+ #process_csv(Input_File, Output_File)
diff --git a/wqflask/maintenance/get_group_samplelists.py b/wqflask/maintenance/get_group_samplelists.py
index fb22898a..0a450d3f 100644
--- a/wqflask/maintenance/get_group_samplelists.py
+++ b/wqflask/maintenance/get_group_samplelists.py
@@ -1,17 +1,17 @@
-from __future__ import absolute_import, print_function, division
-
import os
import glob
import gzip
from base import webqtlConfig
+
def get_samplelist(file_type, geno_file):
if file_type == "geno":
return get_samplelist_from_geno(geno_file)
elif file_type == "plink":
return get_samplelist_from_plink(geno_file)
+
def get_samplelist_from_geno(genofilename):
if os.path.isfile(genofilename + '.gz'):
genofilename += '.gz'
@@ -35,6 +35,7 @@ def get_samplelist_from_geno(genofilename):
samplelist = headers[3:]
return samplelist
+
def get_samplelist_from_plink(genofilename):
genofile = open(genofilename)
diff --git a/wqflask/maintenance/print_benchmark.py b/wqflask/maintenance/print_benchmark.py
index ae327cf3..9d12da8a 100644
--- a/wqflask/maintenance/print_benchmark.py
+++ b/wqflask/maintenance/print_benchmark.py
@@ -1,13 +1,11 @@
#!/usr/bin/python
-from __future__ import absolute_import, print_function, division
-
import time
from pprint import pformat as pf
-class TheCounter(object):
+class TheCounter:
Counters = {}
def __init__(self):
@@ -17,15 +15,18 @@ class TheCounter(object):
self.time_took = time.time() - start_time
TheCounter.Counters[self.__class__.__name__] = self.time_took
+
class PrintAll(TheCounter):
def print_it(self, counter):
print(counter)
+
class PrintSome(TheCounter):
def print_it(self, counter):
if counter % 1000 == 0:
print(counter)
+
class PrintNone(TheCounter):
def print_it(self, counter):
pass
@@ -39,5 +40,6 @@ def new_main():
print(pf(TheCounter.Counters))
+
if __name__ == '__main__':
- new_main() \ No newline at end of file
+ new_main()
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
index 41a3aad8..0cc963e5 100644
--- a/wqflask/maintenance/quantile_normalize.py
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -1,12 +1,7 @@
-from __future__ import absolute_import, print_function, division
-
import sys
-sys.path.insert(0,'./')
-
-from itertools import izip
-
+sys.path.insert(0, './')
import MySQLdb
-import urlparse
+import urllib.parse
import numpy as np
import pandas as pd
@@ -19,48 +14,54 @@ from wqflask import app
from utility.elasticsearch_tools import get_elasticsearch_connection
from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI
+
def parse_db_uri():
"""Converts a database URI to the db name, host name, user name, and password"""
- parsed_uri = urlparse.urlparse(SQL_URI)
+ parsed_uri = urllib.parse.urlparse(SQL_URI)
db_conn_info = dict(
- db = parsed_uri.path[1:],
- host = parsed_uri.hostname,
- user = parsed_uri.username,
- passwd = parsed_uri.password)
+ db=parsed_uri.path[1:],
+ host=parsed_uri.hostname,
+ user=parsed_uri.username,
+ passwd=parsed_uri.password)
print(db_conn_info)
return db_conn_info
+
def create_dataframe(input_file):
with open(input_file) as f:
ncols = len(f.readline().split("\t"))
- input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t", skiprows=1, usecols=range(1, ncols))
+ input_array = np.loadtxt(open(
+ input_file, "rb"), delimiter="\t", skiprows=1, usecols=list(range(1, ncols)))
return pd.DataFrame(input_array)
-#This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+# This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+
+
def quantileNormalize(df_input):
df = df_input.copy()
- #compute rank
+ # compute rank
dic = {}
for col in df:
- dic.update({col : sorted(df[col])})
+ dic.update({col: sorted(df[col])})
sorted_df = pd.DataFrame(dic)
- rank = sorted_df.mean(axis = 1).tolist()
- #sort
+ rank = sorted_df.mean(axis=1).tolist()
+ # sort
for col in df:
t = np.searchsorted(np.sort(df[col]), df[col])
df[col] = [rank[i] for i in t]
return df
+
def set_data(dataset_name):
orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
sample_list = []
with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh:
- for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)):
+ for i, (line1, line2) in enumerate(zip(orig_fh, quant_fh)):
trait_dict = {}
sample_list = []
if i == 0:
@@ -69,10 +70,10 @@ def set_data(dataset_name):
trait_name = line1.split('\t')[0]
for i, sample in enumerate(sample_names):
this_sample = {
- "name": sample,
- "value": line1.split('\t')[i+1],
- "qnorm": line2.split('\t')[i+1]
- }
+ "name": sample,
+ "value": line1.split('\t')[i + 1],
+ "qnorm": line2.split('\t')[i + 1]
+ }
sample_list.append(this_sample)
query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName
FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet
@@ -100,13 +101,14 @@ def set_data(dataset_name):
}
}
+
if __name__ == '__main__':
Conn = MySQLdb.Connect(**parse_db_uri())
Cursor = Conn.cursor()
- #es = Elasticsearch([{
+ # es = Elasticsearch([{
# "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT
- #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None
+ # }], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None
es = get_elasticsearch_connection(for_user=False)
@@ -121,9 +123,9 @@ if __name__ == '__main__':
success, _ = bulk(es, set_data(sys.argv[1]))
response = es.search(
- index = "traits", doc_type = "trait", body = {
- "query": { "match": { "name": "ENSMUSG00000028982" } }
+ index="traits", doc_type="trait", body={
+ "query": {"match": {"name": "ENSMUSG00000028982"}}
}
)
- print(response) \ No newline at end of file
+ print(response)
diff --git a/wqflask/maintenance/set_resource_defaults.py b/wqflask/maintenance/set_resource_defaults.py
new file mode 100644
index 00000000..0f472494
--- /dev/null
+++ b/wqflask/maintenance/set_resource_defaults.py
@@ -0,0 +1,171 @@
+"""
+
+Script that sets default resource access masks for use with the DB proxy
+
+Defaults will be:
+Owner - omni_gn
+Mask - Public/non-confidential: { data: "view",
+ metadata: "view",
+ admin: "not-admin" }
+ Private/confidential: { data: "no-access",
+ metadata: "no-access",
+ admin: "not-admin" }
+
+To run:
+./bin/genenetwork2 ~/my_settings.py -c ./wqflask/maintenance/set_resource_defaults.py
+
+"""
+
+import sys
+import json
+
+# NEW: Note we prepend the current path - otherwise a guix instance of GN2 may be used instead
+sys.path.insert(0, './')
+
+# NEW: import app to avoid a circular dependency on utility.tools
+from wqflask import app
+
+from utility import hmac
+from utility.tools import SQL_URI
+from utility.redis_tools import get_redis_conn, get_user_id, add_resource, get_resources, get_resource_info
+Redis = get_redis_conn()
+
+import MySQLdb
+
+import urllib.parse
+
+from utility.logger import getLogger
+logger = getLogger(__name__)
+
+
+def parse_db_uri():
+ """Converts a database URI to the db name, host name, user name, and password"""
+
+ parsed_uri = urllib.parse.urlparse(SQL_URI)
+
+ db_conn_info = dict(
+ db=parsed_uri.path[1:],
+ host=parsed_uri.hostname,
+ user=parsed_uri.username,
+ passwd=parsed_uri.password)
+
+ print(db_conn_info)
+ return db_conn_info
+
+
+def insert_probeset_resources(default_owner_id):
+ current_resources = Redis.hgetall("resources")
+ Cursor.execute(""" SELECT
+ ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.confidentiality, ProbeSetFreeze.public
+ FROM
+ ProbeSetFreeze""")
+
+ resource_results = Cursor.fetchall()
+ for i, resource in enumerate(resource_results):
+ resource_ob = {}
+ resource_ob['name'] = resource[1]
+ resource_ob['owner_id'] = default_owner_id
+ resource_ob['data'] = {"dataset": str(resource[0])}
+ resource_ob['type'] = "dataset-probeset"
+ if resource[2] < 1 and resource[3] > 0:
+ resource_ob['default_mask'] = {"data": "view",
+ "metadata": "view",
+ "admin": "not-admin"}
+ else:
+ resource_ob['default_mask'] = {"data": "no-access",
+ "metadata": "no-access",
+ "admin": "not-admin"}
+ resource_ob['group_masks'] = {}
+
+ add_resource(resource_ob, update=False)
+
+
+def insert_publish_resources(default_owner_id):
+ current_resources = Redis.hgetall("resources")
+ Cursor.execute(""" SELECT
+ PublishXRef.Id, PublishFreeze.Id, InbredSet.InbredSetCode
+ FROM
+ PublishXRef, PublishFreeze, InbredSet, Publication
+ WHERE
+ PublishFreeze.InbredSetId = PublishXRef.InbredSetId AND
+ InbredSet.Id = PublishXRef.InbredSetId AND
+ Publication.Id = PublishXRef.PublicationId""")
+
+ resource_results = Cursor.fetchall()
+ for resource in resource_results:
+ if resource[2]:
+ resource_ob = {}
+ if resource[2]:
+ resource_ob['name'] = resource[2] + "_" + str(resource[0])
+ else:
+ resource_ob['name'] = str(resource[0])
+ resource_ob['owner_id'] = default_owner_id
+ resource_ob['data'] = {"dataset": str(resource[1]),
+ "trait": str(resource[0])}
+ resource_ob['type'] = "dataset-publish"
+ resource_ob['default_mask'] = {"data": "view",
+ "metadata": "view",
+ "admin": "not-admin"}
+
+ resource_ob['group_masks'] = {}
+
+ add_resource(resource_ob, update=False)
+ else:
+ continue
+
+
+def insert_geno_resources(default_owner_id):
+ current_resources = Redis.hgetall("resources")
+ Cursor.execute(""" SELECT
+ GenoFreeze.Id, GenoFreeze.ShortName, GenoFreeze.confidentiality
+ FROM
+ GenoFreeze""")
+
+ resource_results = Cursor.fetchall()
+ for i, resource in enumerate(resource_results):
+ resource_ob = {}
+ resource_ob['name'] = resource[1]
+ if resource[1] == "HET3-ITPGeno":
+ resource_ob['owner_id'] = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae"
+ else:
+ resource_ob['owner_id'] = default_owner_id
+ resource_ob['data'] = {"dataset": str(resource[0])}
+ resource_ob['type'] = "dataset-geno"
+ if resource[2] < 1:
+ resource_ob['default_mask'] = {"data": "view",
+ "metadata": "view",
+ "admin": "not-admin"}
+ else:
+ resource_ob['default_mask'] = {"data": "no-access",
+ "metadata": "no-access",
+ "admin": "not-admin"}
+ resource_ob['group_masks'] = {}
+
+ add_resource(resource_ob, update=False)
+
+
+def insert_resources(default_owner_id):
+ current_resources = get_resources()
+ print("START")
+ insert_publish_resources(default_owner_id)
+ print("AFTER PUBLISH")
+ insert_geno_resources(default_owner_id)
+ print("AFTER GENO")
+ insert_probeset_resources(default_owner_id)
+ print("AFTER PROBESET")
+
+
+def main():
+ """Deletes the existing "resources" entry in Redis, then re-inserts the default resources under the default owner"""
+
+ Redis.delete("resources")
+
+ owner_id = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae"
+
+ insert_resources(owner_id)
+
+
+if __name__ == '__main__':
+ Conn = MySQLdb.Connect(**parse_db_uri())
+ Cursor = Conn.cursor()
+ main()