aboutsummaryrefslogtreecommitdiff
path: root/gn2/maintenance
diff options
context:
space:
mode:
authorArun Isaac2023-12-29 18:55:37 +0000
committerArun Isaac2023-12-29 19:01:46 +0000
commit204a308be0f741726b9a620d88fbc22b22124c81 (patch)
treeb3cf66906674020b530c844c2bb4982c8a0e2d39 /gn2/maintenance
parent83062c75442160427b50420161bfcae2c5c34c84 (diff)
downloadgenenetwork2-204a308be0f741726b9a620d88fbc22b22124c81.tar.gz
Namespace all modules under gn2.
We move all modules under a gn2 directory. This is important for "correct" packaging and deployment as a Guix service.
Diffstat (limited to 'gn2/maintenance')
-rw-r--r--gn2/maintenance/README.md4
-rw-r--r--gn2/maintenance/__init__.py0
-rw-r--r--gn2/maintenance/convert_dryad_to_bimbam.py72
-rw-r--r--gn2/maintenance/convert_geno_to_bimbam.py201
-rw-r--r--gn2/maintenance/gen_ind_genofiles.py253
-rw-r--r--gn2/maintenance/gen_select_dataset.py296
-rw-r--r--gn2/maintenance/generate_kinship_from_bimbam.py66
-rw-r--r--gn2/maintenance/generate_probesetfreeze_file.py122
-rw-r--r--gn2/maintenance/geno_to_json.py196
-rw-r--r--gn2/maintenance/get_group_samplelists.py47
-rw-r--r--gn2/maintenance/print_benchmark.py45
-rw-r--r--gn2/maintenance/quantile_normalize.py98
-rw-r--r--gn2/maintenance/set_resource_defaults.py153
13 files changed, 1553 insertions, 0 deletions
diff --git a/gn2/maintenance/README.md b/gn2/maintenance/README.md
new file mode 100644
index 00000000..873eaa32
--- /dev/null
+++ b/gn2/maintenance/README.md
@@ -0,0 +1,4 @@
+Maintenance files have been moved into a separate repository named
+*gn_extra*. See https://github.com/genenetwork/gn_extra
+
+
diff --git a/gn2/maintenance/__init__.py b/gn2/maintenance/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/gn2/maintenance/__init__.py
diff --git a/gn2/maintenance/convert_dryad_to_bimbam.py b/gn2/maintenance/convert_dryad_to_bimbam.py
new file mode 100644
index 00000000..18fbb8a1
--- /dev/null
+++ b/gn2/maintenance/convert_dryad_to_bimbam.py
@@ -0,0 +1,72 @@
+#!/usr/bin/python
+
+"""
+Convert data dryad files to a BIMBAM _geno and _snps file
+
+
+"""
+
+import sys
+sys.path.append("..")
+
+
def read_dryad_file(filename):
    """Read a Data Dryad genotype table and pivot it into BIMBAM rows.

    The first line of the file is a header whose fields from the third
    onwards are marker names.  Every following line holds a sample name,
    an exclusion flag and one genotype per marker, separated by single
    spaces.  Only samples whose flag is exactly "no" are kept; the number
    of other samples is printed.

    :param filename: path of the space-separated input file
    :return: one list per marker, ``[marker, "X", "Y", genotype, ...]``,
        with genotypes in the order the kept samples appear in the file
    """
    excluded = 0
    markers = []
    kept_genotypes = []

    with open(filename, 'r') as the_file:
        for line_number, raw_line in enumerate(the_file):
            # Drop the line terminator first; otherwise it stays glued to
            # the last marker name and to every sample's last genotype.
            fields = raw_line.rstrip("\r\n").split(" ")
            if line_number == 0:
                markers = fields[2:]
                continue
            if len(fields) < 2:
                # Blank or truncated line: there is no flag to inspect.
                continue
            if fields[1] == "no":
                kept_genotypes.append(fields[2:])
            else:
                excluded += 1

    # Transpose: the input is one row per sample, BIMBAM wants one row per
    # marker.  "X" and "Y" are the placeholder allele columns.
    geno_rows = [
        [marker, "X", "Y"] + [genotypes[i] for genotypes in kept_genotypes]
        for i, marker in enumerate(markers)
    ]

    print(excluded)

    return geno_rows
+
+
def write_bimbam_files(geno_rows,
                       output_path='/home/zas1024/cfw_data/CFW_geno.txt'):
    """Write marker rows as a comma-separated BIMBAM genotype file.

    :param geno_rows: iterable of rows, each a sequence of strings
    :param output_path: destination file; defaults to the location the
        script has always written to
    """
    with open(output_path, 'w') as geno_fh:
        geno_fh.writelines(", ".join(row) + "\n" for row in geno_rows)
+
+
def convert_dryad_to_bimbam(filename):
    """Read the Data Dryad file at *filename* and write its BIMBAM rows."""
    write_bimbam_files(read_dryad_file(filename))
+
+
if __name__ == "__main__":
    # The first command-line argument is the base name of a .txt file in
    # the fixed cfw_data directory.
    dataset_name = sys.argv[1]
    convert_dryad_to_bimbam("/home/zas1024/cfw_data/" + dataset_name + ".txt")
diff --git a/gn2/maintenance/convert_geno_to_bimbam.py b/gn2/maintenance/convert_geno_to_bimbam.py
new file mode 100644
index 00000000..078be529
--- /dev/null
+++ b/gn2/maintenance/convert_geno_to_bimbam.py
@@ -0,0 +1,201 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to BIMBAM
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to BIMBAM files (_geno.txt, _pheno.txt and _snps.txt) that
+are used when running the marker regression code
+
+"""
+
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+import gzip
+
+import simplejson as json
+
+from pprint import pformat as pf
+
+
class EmptyConfigurations(Exception):
    """Raised when a .geno file provides no usable genotype symbol table."""
+
+
class Marker:
    """Plain record for one marker row of a .geno file."""

    def __init__(self):
        # Identification and position fields start out unset; the chained
        # assignment keeps the attribute order name, chr, cM, Mb.
        self.name = self.chr = self.cM = self.Mb = None
        # One converted genotype value per sample column.
        self.genotypes = []
+
+
class ConvertGenoFile:
    """Convert one ``.geno`` genotype file into three BIMBAM text files.

    ``output_files`` is a sequence of three paths, in this order: the
    genotype file, the phenotype file (one "1" per sample) and the SNP
    position file.
    """

    def __init__(self, input_file, output_files):
        self.input_file = input_file
        self.output_files = output_files

        # Which position columns the "Chr ..." header row declares.
        self.mb_exists = False
        self.cm_exists = False
        self.markers = []

        # Parse position, reported by process_all() when conversion fails.
        self.latest_row_pos = None
        self.latest_col_pos = None

        self.latest_row_value = None
        self.latest_col_value = None

    def convert(self):
        """Parse ``self.input_file`` and write the three output files.

        :raises EmptyConfigurations: if the file contains an ``@filler``
            line or reaches marker rows without any genotype symbol
            declaration
        """
        # Numeric BIMBAM value for each genotype class of the .geno format.
        self.haplotype_notation = {
            '@mat': "1",
            '@pat': "0",
            '@het': "0.5",
            '@unk': "NA"
        }

        # Maps the symbols declared by this particular file to the values
        # above; filled in by process_rows().
        self.configurations = {}

        # process_all() also selects ".geno.gz" inputs, which have to be
        # decompressed to be readable.  The with-block guarantees the
        # handle is closed even when parsing raises.
        opener = gzip.open if self.input_file.endswith(".gz") else open
        with opener(self.input_file, "rt") as self.input_fh:
            self.process_csv()

    def process_csv(self):
        """Turn every marker row into a Marker dict, then write the output."""
        for row in self.process_rows():
            row_items = row.split("\t")

            this_marker = Marker()
            this_marker.name = row_items[1]
            this_marker.chr = row_items[0]
            # Genotype columns start after whichever position columns the
            # header declared.
            if self.cm_exists and self.mb_exists:
                this_marker.cM = row_items[2]
                this_marker.Mb = row_items[3]
                genotypes = row_items[4:]
            elif self.cm_exists:
                this_marker.cM = row_items[2]
                genotypes = row_items[3:]
            elif self.mb_exists:
                this_marker.Mb = row_items[2]
                genotypes = row_items[3:]
            else:
                genotypes = row_items[2:]

            for genotype in genotypes:
                symbol = genotype.upper().strip()
                # Symbols the file did not declare are written as missing.
                this_marker.genotypes.append(
                    self.configurations.get(symbol, "NA"))

            self.markers.append(this_marker.__dict__)

        self.write_to_bimbam()

    def write_to_bimbam(self):
        """Write the genotype, phenotype and SNP files from parsed markers."""
        with open(self.output_files[0], "w") as geno_fh:
            for marker in self.markers:
                geno_fh.write(marker['name'])
                geno_fh.write(", X, Y")
                geno_fh.write(", " + ", ".join(marker['genotypes']))
                geno_fh.write("\n")

        with open(self.output_files[1], "w") as pheno_fh:
            # Dummy phenotype: a constant 1 for every sample.
            for _sample in self.sample_list:
                pheno_fh.write("1\n")

        # Mb is preferred for the position; cM is the fallback.
        position_key = 'Mb' if self.mb_exists else 'cM'
        with open(self.output_files[2], "w") as snp_fh:
            for marker in self.markers:
                position = int(float(marker[position_key]) * 1000000)
                snp_fh.write(
                    marker['name'] + ", " + str(position) + ", " + marker['chr'] + "\n")

    def get_sample_list(self, row_contents):
        """Store the sample names found after the fixed header columns."""
        self.sample_list = []
        if self.mb_exists:
            if self.cm_exists:
                self.sample_list = row_contents[4:]
            else:
                self.sample_list = row_contents[3:]
        else:
            if self.cm_exists:
                self.sample_list = row_contents[3:]
            else:
                self.sample_list = row_contents[2:]

    def process_rows(self):
        """Yield marker rows, consuming comment, header and ``@`` lines.

        :raises EmptyConfigurations: on an ``@filler`` line, or when a
            marker row is reached before any genotype symbol was declared
        """
        for self.latest_row_pos, row in enumerate(self.input_fh):
            self.latest_row_value = row
            # Take care of headers
            if not row.strip():
                continue
            if row.startswith('#'):
                continue
            if row.startswith('Chr'):
                if 'Mb' in row.split():
                    self.mb_exists = True
                if 'cM' in row.split():
                    self.cm_exists = True
                self.get_sample_list(row.split())
                continue
            if row.startswith('@'):
                key, _separater, value = row.partition(':')
                key = key.strip()
                value = value.strip()
                if key == "@filler":
                    raise EmptyConfigurations
                if key in self.haplotype_notation:
                    self.configurations[value] = self.haplotype_notation[key]
                continue
            if not len(self.configurations):
                raise EmptyConfigurations
            yield row

    @classmethod
    def process_all(cls, old_directory, new_directory):
        """Convert every genotype file found in *old_directory*.

        Changes the working directory to *old_directory*.  Files without
        configuration lines are skipped; any other error stops the run
        after printing the traceback and the last row read.
        """
        os.chdir(old_directory)
        for input_file in glob.glob("*"):
            if not input_file.endswith(('geno', '.geno.gz')):
                continue
            group_name = ".".join(input_file.split('.')[:-1])
            if group_name == "HSNIH-Palmer":
                continue
            geno_output_file = os.path.join(
                new_directory, group_name + "_geno.txt")
            pheno_output_file = os.path.join(
                new_directory, group_name + "_pheno.txt")
            snp_output_file = os.path.join(
                new_directory, group_name + "_snps.txt")
            output_files = [geno_output_file,
                            pheno_output_file, snp_output_file]
            print("%s -> %s" % (
                os.path.join(old_directory, input_file), geno_output_file))
            convertob = ConvertGenoFile(input_file, output_files)
            try:
                convertob.convert()
            except EmptyConfigurations:
                print(" No config info? Continuing...")
                continue
            except Exception as why:
                print(" Exception:", why)
                # print_exc() writes the traceback itself and returns None,
                # so it must not be wrapped in print().
                traceback.print_exc()
                print(" Found in row %s at tabular column %s" % (convertob.latest_row_pos,
                                                                 convertob.latest_col_pos))
                print(" Column is:", convertob.latest_col_value)
                print(" Row is:", convertob.latest_row_value)
                break
+
+
if __name__ == "__main__":
    # Directory holding the .geno files, and the directory that receives
    # the generated BIMBAM files.
    Old_Geno_Directory = "/export/local/home/zas1024/gn2-zach/genotype_files/genotype"
    New_Geno_Directory = "/export/local/home/zas1024/gn2-zach/genotype_files/genotype/bimbam"
    ConvertGenoFile.process_all(
        old_directory=Old_Geno_Directory, new_directory=New_Geno_Directory)
diff --git a/gn2/maintenance/gen_ind_genofiles.py b/gn2/maintenance/gen_ind_genofiles.py
new file mode 100644
index 00000000..b755c648
--- /dev/null
+++ b/gn2/maintenance/gen_ind_genofiles.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""A script that generates the genotype files for groups of individuals, using an existing strain genotype file as a basis
+
+Example commands:
+python3 gen_ind_genofiles.py
+ /home/zas1024/gn2-zach/genotype_files/genotype/
+ /home/zas1024/gn2-zach/new_geno/
+ BXD-Micturition.geno
+ BXD.json
+python3 gen_ind_genofiles.py
+ /home/zas1024/gn2-zach/genotype_files/genotype
+ /home/zas1024/gn2-zach/new_geno/
+ BXD-Micturition.geno
+ BXD.2.geno BXD.4.geno BXD.5.geno
+
+"""
+
+import json
+import os
+import sys
+from typing import List
+
+import MySQLdb
+
def conn():
    """Open a MySQL connection configured from DB_* environment variables."""
    env = os.environ
    return MySQLdb.Connect(
        db=env.get("DB_NAME"),
        user=env.get("DB_USER"),
        passwd=env.get("DB_PASS"),
        host=env.get("DB_HOST"),
    )
+
def main(args):
    """Generate the .geno file(s) for a group of individuals.

    :param args: the raw argument vector (``sys.argv``):
        ``args[1]`` directory in which the .geno files are located,
        ``args[2]`` directory in which to write the new files,
        ``args[3]`` file name of the target (individuals) group,
        ``args[4:]`` either one JSON file describing the source .geno
        files, or one or more source .geno file names (the ".geno" suffix
        may be omitted).
    """
    # os.path.join(x, "") appends a separator only when one is missing, so
    # the directories may be given with or without a trailing slash and
    # the plain concatenations below stay valid.
    geno_dir = os.path.join(args[1], "")
    out_dir = os.path.join(args[2], "")

    # The individuals group that we want to generate a .geno file for
    target_file = geno_dir + args[3]

    # The source group(s) we're generating the .geno files from: either
    # specific .geno file(s), or a JSON file listing a set of .geno files
    # together with the parent and F1 strains.
    source_files = []
    if ".json" in args[4]:
        with open(geno_dir + args[4], "r") as json_fh:
            geno_json = json.load(json_fh)
        par_f1s = {
            "mat": geno_json['mat'],
            "pat": geno_json['pat'],
            "f1s": geno_json['f1s']
        }

        # List of file titles and locations from JSON
        source_files = [
            {'title': genofile['title'],
             'location': geno_dir + genofile['location']}
            for genofile in geno_json['genofile']
        ]
    else:
        par_f1s = {}
        # Files taken directly from the command line; the title is the
        # file path without its ".geno" suffix.
        for group in args[4:]:
            file_name = geno_dir + (group if ".geno" in group else group + ".geno")
            source_files.append({'title': file_name[:-5], 'location': file_name})

    if len(source_files) > 1:
        # With several source files, also write a JSON file pointing to
        # the generated target genotype files.
        target_json_loc = out_dir + ".".join(args[3].split(".")[:-1]) + ".json"
        target_json = {'genofile': []}

        for source_file in source_files:
            filename, samples = generate_new_genofile(
                source_file['location'], target_file, par_f1s, out_dir)

            target_json['genofile'].append({
                'location': filename.split("/")[-1],
                'title': source_file['title'],
                'sample_list': samples
            })

        with open(target_json_loc, "w") as json_fh:
            json.dump(target_json, json_fh)
    else:
        generate_new_genofile(
            source_files[0]['location'], target_file, par_f1s, out_dir)
+
def get_strain_for_sample(sample):
    """Look up the strain recorded for an individual sample.

    The strain is read from the CaseAttributeXRefNew row with
    CaseAttributeId 11 belonging to the Strain whose name equals *sample*
    (surrounding whitespace stripped).

    :raises TypeError: if the query returns no row
    """
    query = (
        "SELECT CaseAttributeXRefNew.Value "
        "FROM CaseAttributeXRefNew, Strain "
        "WHERE CaseAttributeXRefNew.CaseAttributeId=11 "
        "AND CaseAttributeXRefNew.StrainId = Strain.Id "
        "AND Strain.Name = %(name)s")

    # conn() opens a fresh connection on every call; close it explicitly
    # so that mapping a large group does not accumulate open connections.
    connection = conn()
    try:
        with connection.cursor() as cursor:
            cursor.execute(query, {"name": sample.strip()})
            strain = cursor.fetchone()[0]
    finally:
        connection.close()
    return strain
+
def generate_new_genofile(source_genofile, target_genofile, par_f1s, out_dir):
    """Write a .geno file for the target group from a source strain file.

    Every target sample receives the genotypes of its strain in the source
    file.  Samples whose strain is listed as a parent or F1 in *par_f1s*
    receive the corresponding symbol; all others receive the unknown
    symbol.

    :param source_genofile: path of the source strain .geno file
    :param target_genofile: path of the target group's .geno file, which
        supplies the sample list
    :param par_f1s: dict with "mat", "pat" and "f1s" strain lists (may be
        empty)
    :param out_dir: directory to write the new file into
    :return: tuple of (path of the written file, list of target samples)
    """
    source_samples = group_samples(source_genofile)
    source_genotypes = strain_genotypes(source_genofile)
    target_samples = group_samples(target_genofile)
    strain_pos_map = map_strain_pos_to_target_group(source_samples, target_samples, par_f1s)

    source_name_parts = source_genofile.split("/")[-1].split(".")
    target_stem = ".".join(target_genofile.split("/")[-1].split(".")[:-1])
    if len(source_name_parts) > 2:
        # Carry over the number in the source file name; for example the
        # 4 in BXD.4.geno.
        target_filename = target_stem + "." + source_name_parts[-2] + ".geno"
    else:
        target_filename = target_stem + ".geno"

    # join instead of "+" so out_dir works with or without a trailing slash
    file_location = os.path.join(out_dir, target_filename)

    # Symbols for samples without a column in the source file.  They are
    # taken from the source metadata because that same metadata is copied
    # into the header below, keeping header and rows consistent.
    fallback_symbols = {
        "mat": source_genotypes["mat"],
        "pat": source_genotypes["pat"],
        "f1s": source_genotypes["het"],
    }
    unknown_symbol = source_genotypes["unk"]

    with open(file_location, "w") as fh:
        for metadata in ["name", "type", "mat", "pat", "het", "unk"]:
            fh.write("@" + metadata + ":" + source_genotypes[metadata] + "\n")

        header_line = ["Chr", "Locus", "cM", "Mb"] + target_samples
        fh.write("\t".join(header_line) + "\n")

        for marker in source_genotypes['markers']:
            line_items = [
                marker['Chr'],
                marker['Locus'],
                marker['cM'],
                marker['Mb']
            ]

            for pos in strain_pos_map:
                if isinstance(pos, int):
                    # Index of the sample's strain in the source file.
                    line_items.append(marker['genotypes'][pos])
                else:
                    line_items.append(fallback_symbols.get(pos, unknown_symbol))

            fh.write("\t".join(line_items) + "\n")

    return file_location, target_samples
+
def map_strain_pos_to_target_group(source_samples, target_samples, par_f1s):
    """
    Find, for each target sample, where its strain sits in the source file

    The result lets the genotypes of the base genofile be copied to the
    samples of the target group.  A sample whose strain is not among the
    source samples gets the key of the par_f1s entry that lists its strain
    (the last matching key wins), or "U" when no entry lists it.

    For example:
    Base strains: BXD1, BXD2, BXD3
    Target samples: BXD1_1, BXD1_2, BXD2_1, BXD3_1, BXD3_2, BXD3_3
    Returns: [0, 0, 1, 2, 2, 2]
    """
    positions = []
    for target in target_samples:
        strain = get_strain_for_sample(target)
        if strain in source_samples:
            positions.append(source_samples.index(strain))
            continue

        label = "U"
        for category, members in par_f1s.items():
            if strain in members:
                label = category
        positions.append(label)

    return positions
+
def group_samples(target_file: str) -> List:
    """
    Get the group samples from its "dummy" .geno file (which still contains the sample list)

    The first line that is not blank and does not start with "#" or "@"
    is taken as the column header; its whitespace-separated items other
    than "Chr", "Locus", "Mb" and "cM" are the samples.

    :param target_file: path of the .geno file
    :return: list of sample names, empty if the file has no header line
    """
    with open(target_file, "r") as target_geno:
        for line in target_geno:
            # Lines read from a file always contain at least "\n", so
            # blank lines have to be detected after stripping.
            if not line.strip() or line[0] in ("#", "@"):
                continue

            return [item for item in line.split()
                    if item not in ("Chr", "Locus", "Mb", "cM")]

    return []
+
def strain_genotypes(strain_genofile: str) -> dict:
    """
    Read metadata and genotypes from a source strain .geno file

    :param strain_genofile: string of genofile filename
    :return: a dictionary holding the file's "@" metadata (the keys name,
        type, mat, pat, het and unk, where present) plus a 'markers' list
        with one dictionary per marker row

    Example output: {
        'name': 'BXD',
        'mat': 'B',
        ...
        'markers': [
            {
                'Chr': '1',
                'Locus': 'marker1',
                'Mb': '10.0',
                'cM': '8.0',
                'genotypes': ['B', 'D', 'H', ...]
            },
            ...
        ]
    }
    """
    metadata_keys = ('name', 'type', 'mat', 'pat', 'het', 'unk')
    position_columns = ("Chr", "Locus", "Mb", "cM")

    geno_dict = {}
    geno_start_col = None
    header_columns = []
    markers = []
    with open(strain_genofile, "r") as source_geno:
        for line in source_geno:
            # Lines read from a file are never empty strings, so blank
            # lines must be recognised after stripping.
            if not line.strip():
                continue

            if line[0] == "@":
                # partition keeps any further ":" inside the value
                metadata_type, _sep, value = line[1:].partition(":")
                if metadata_type in metadata_keys:
                    geno_dict[metadata_type] = value.strip()
                continue

            # Skip other header lines
            if line[0] == "#":
                continue

            line_items = line.rstrip("\r\n").split("\t")
            if "Chr" in line_items:  # Header row
                header_columns = line_items
                # Get the first column index containing genotypes
                for j, item in enumerate(line_items):
                    if item not in position_columns:
                        geno_start_col = j
                        break

                if not geno_start_col:
                    print("Check .geno file - expected columns not found")
                    sys.exit()
            else:  # Marker rows
                markers.append({
                    'Chr': line_items[header_columns.index("Chr")],
                    'Locus': line_items[header_columns.index("Locus")],
                    'Mb': line_items[header_columns.index("Mb")],
                    'cM': line_items[header_columns.index("cM")],
                    'genotypes': [item.strip() for item in line_items][geno_start_col:]
                })

    geno_dict['markers'] = markers

    return geno_dict
+
if __name__ == "__main__":
    # main() indexes the raw argument vector positionally, so hand it
    # sys.argv unchanged (index 0 is the script name).
    main(sys.argv)
+
diff --git a/gn2/maintenance/gen_select_dataset.py b/gn2/maintenance/gen_select_dataset.py
new file mode 100644
index 00000000..5f41da29
--- /dev/null
+++ b/gn2/maintenance/gen_select_dataset.py
@@ -0,0 +1,296 @@
+"""Script that generates the data for the main dropdown menus on the home page
+
+Writes out data as /static/new/javascript/dataset_menu_structure.json
+It needs to be run manually when database has been changed. Run it as
+
+ ./bin/genenetwork2 ~/my_settings.py -c ./gn2/maintenance/gen_select_dataset.py
+
+"""
+
+
+# Copyright (C) University of Tennessee Health Science Center, Memphis, TN.
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License
+# as published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU Affero General Public License for more details.
+#
+# This program is available from Source Forge: at GeneNetwork Project
+# (sourceforge.net/projects/genenetwork/).
+#
+# Contact Drs. Robert W. Williams
+# at rwilliams@uthsc.edu
+#
+#
+#
+# This module is used by GeneNetwork project (www.genenetwork.org)
+
+import sys
+
+# NEW: Note we prepend the current path - otherwise a guix instance of GN2 may be used instead
+sys.path.insert(0, './')
+# NEW: import app to avoid a circular dependency on utility.tools
+from gn2.wqflask import app
+
+from gn2.utility.tools import get_setting
+
+import simplejson as json
+import urllib.parse
+
+
+from pprint import pformat as pf
+
+from gn2.wqflask.database import database_connection
+
+
def get_species(cursor):
    """Return the (Name, MenuName) rows of Species as a list, by OrderId."""
    cursor.execute("select Name, MenuName from Species order by OrderId")
    return list(cursor.fetchall())
+
+
def get_groups(cursor, species):
    """Build groups list

    :param cursor: open database cursor
    :param species: iterable of (species name, species menu name) pairs
    :return: dict mapping each species name to a list of
        (InbredSet.Name, InbredSet.FullName) rows
    """
    # The species name is passed as a query parameter rather than being
    # interpolated, so a name containing a quote cannot break the SQL.
    query = """select InbredSet.Name, InbredSet.FullName from InbredSet,
               Species,
               ProbeFreeze, GenoFreeze, PublishFreeze where Species.Name = %s
               and InbredSet.SpeciesId = Species.Id and
               (PublishFreeze.InbredSetId = InbredSet.Id
               or GenoFreeze.InbredSetId = InbredSet.Id
               or ProbeFreeze.InbredSetId = InbredSet.Id)
               group by InbredSet.Name
               order by InbredSet.FullName"""
    groups = {}
    for species_name, _species_full_name in species:
        cursor.execute(query, (species_name,))
        groups[species_name] = list(cursor.fetchall())
    return groups
+
+
def get_types(groups):
    """Build types list

    For every group the result lists ("Phenotypes", "Phenotypes") and/or
    ("Genotypes", "Genotypes") when the matching datasets exist, followed
    by one entry per tissue returned by build_types().

    Groups that end up with no type at all are left out of the result and
    are also removed from *groups*, which is modified in place.

    :param groups: dict mapping species name to (group name, full name)
        pairs, as returned by get_groups()
    :return: dict of species name -> group name -> list of type tuples
    """
    types = {}
    for species, group_list in list(groups.items()):
        species_types = types[species] = {}
        for group_name, _group_full_name in group_list:
            type_list = []
            if phenotypes_exist(group_name):
                type_list.append(("Phenotypes", "Phenotypes"))
            if genotypes_exist(group_name):
                type_list.append(("Genotypes", "Genotypes"))
            type_list += build_types(species, group_name)

            if type_list:
                species_types[group_name] = type_list
            else:
                # Nothing to show for this group: drop it from the group
                # menu too.  A new tuple is assigned, so the list being
                # iterated over is not disturbed.
                groups[species] = tuple(
                    group for group in groups[species] if group[0] != group_name)
    return types
+
+
def phenotypes_exist(group_name, cursor=None):
    """Return True if a PublishFreeze named ``<group_name>Publish`` exists.

    :param group_name: name of the group (InbredSet)
    :param cursor: database cursor to use; when omitted the module-level
        ``Cursor`` is used
    """
    cur = Cursor if cursor is None else cursor
    cur.execute("""select Name from PublishFreeze
                   where PublishFreeze.Name = %s""", (group_name + "Publish",))

    return cur.fetchone() is not None
+
+
def genotypes_exist(group_name, cursor=None):
    """Return True if a GenoFreeze named ``<group_name>Geno`` exists.

    :param group_name: name of the group (InbredSet)
    :param cursor: database cursor to use; when omitted the module-level
        ``Cursor`` is used
    """
    cur = Cursor if cursor is None else cursor
    cur.execute("""select Name from GenoFreeze
                   where GenoFreeze.Name = %s""", (group_name + "Geno",))

    return cur.fetchone() is not None
+
+
def build_types(species, group):
    """Fetches tissues

    Gets the tissues with data for this species/group
    (all types except phenotype/genotype are tissues)

    :return: list of (tissue name, tissue name) tuples, restricted to
        tissues for which build_datasets() returns at least one dataset
    """
    # Values are passed as query parameters instead of being interpolated
    # into the SQL text.
    Cursor.execute("""select distinct Tissue.Name
                      from ProbeFreeze, ProbeSetFreeze, InbredSet, Tissue, Species
                      where Species.Name = %s and Species.Id = InbredSet.SpeciesId and
                      InbredSet.Name = %s and
                      ProbeFreeze.TissueId = Tissue.Id and
                      ProbeFreeze.InbredSetId = InbredSet.Id and
                      ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and
                      ProbeSetFreeze.public > 0 and
                      ProbeSetFreeze.confidentiality < 1
                      order by Tissue.Name""", (species, group))

    results = []
    # fetchall() is evaluated once, before build_datasets() reuses the
    # cursor for its own queries.
    for result in Cursor.fetchall():
        if len(result):
            these_datasets = build_datasets(species, group, result[0])
            if len(these_datasets) > 0:
                results.append((result[0], result[0]))

    return results
+
+
def get_datasets(types):
    """Build the species -> group -> type -> datasets mapping.

    Types for which build_datasets() returns nothing are left out; every
    species and group still gets a (possibly empty) dictionary.
    """
    datasets = {}
    for species, groups_for_species in list(types.items()):
        per_species = {}
        datasets[species] = per_species
        for group, type_list in list(groups_for_species.items()):
            per_group = {}
            per_species[group] = per_group
            for type_entry in type_list:
                type_key = type_entry[0]
                found = build_datasets(species, group, type_key)
                if len(found) > 0:
                    per_group[type_key] = found

    return datasets
+
+
def build_datasets(species, group, type_name, cursor=None):
    """Gets dataset names from database

    :param species: species name; only used for tissue (ProbeSet) types
    :param group: group (InbredSet) name
    :param type_name: "Phenotypes", "Genotypes", or a tissue name
    :param cursor: database cursor to use; when omitted the module-level
        ``Cursor`` is used
    :return: for "Phenotypes" and "Genotypes" a list of
        (id, value, text) tuples, with the id "None" when no InfoFiles
        row was found; for a tissue a list of [id, name, full name] lists
    """
    cur = Cursor if cursor is None else cursor
    datasets = []
    if type_name == "Phenotypes":
        cur.execute("""select InfoFiles.GN_AccesionId, PublishFreeze.Name, PublishFreeze.FullName from InfoFiles, PublishFreeze, InbredSet where
                       InbredSet.Name = %s and
                       PublishFreeze.InbredSetId = InbredSet.Id and
                       InfoFiles.InfoPageName = PublishFreeze.Name order by
                       PublishFreeze.CreateTime asc""", (group,))

        results = cur.fetchall()
        if len(results) > 0:
            for result in results:
                print(result)
                dataset_id = str(result[0])
                dataset_value = str(result[1])
                if group == 'MDP':
                    dataset_text = "Mouse Phenome Database"
                else:
                    dataset_text = str(result[2])
                datasets.append((dataset_id, dataset_value, dataset_text))
        else:
            # No InfoFiles entry: fall back to the conventional names.
            dataset_id = "None"
            dataset_value = "%sPublish" % group
            dataset_text = "%s Phenotypes" % group
            datasets.append((dataset_id, dataset_value, dataset_text))

    elif type_name == "Genotypes":
        cur.execute("""select InfoFiles.GN_AccesionId from InfoFiles, GenoFreeze, InbredSet where
                       InbredSet.Name = %s and
                       GenoFreeze.InbredSetId = InbredSet.Id and
                       InfoFiles.InfoPageName = GenoFreeze.ShortName and
                       GenoFreeze.public > 0 and
                       GenoFreeze.confidentiality < 1 order by
                       GenoFreeze.CreateTime desc""", (group,))

        results = cur.fetchone()
        if results is not None:
            dataset_id = str(results[0])
        else:
            dataset_id = "None"
        dataset_value = "%sGeno" % group
        dataset_text = "%s Genotypes" % group
        datasets.append((dataset_id, dataset_value, dataset_text))

    else:  # for mRNA expression/ProbeSet
        cur.execute("""select ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName from
                       ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue, Species where
                       Species.Name = %s and Species.Id = InbredSet.SpeciesId and
                       InbredSet.Name = %s and
                       ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and Tissue.Name = %s and
                       ProbeFreeze.TissueId = Tissue.Id and ProbeFreeze.InbredSetId = InbredSet.Id and
                       ProbeSetFreeze.confidentiality < 1 and ProbeSetFreeze.public > 0 order by
                       ProbeSetFreeze.CreateTime desc""", (species, group, type_name))

        # Every column is stringified so the result serialises uniformly.
        datasets = [[str(info) for info in dataset_info]
                    for dataset_info in cur.fetchall()]

    return datasets
+
+
def main(cursor):
    """Generates and outputs (as json file) the data for the main dropdown menus on the home page

    :param cursor: open database cursor used for every query
    """
    # phenotypes_exist(), genotypes_exist(), build_types() and
    # build_datasets() read the module-level name ``Cursor``, which is not
    # assigned anywhere else in this module; bind it here so they do not
    # fail with a NameError.
    global Cursor
    Cursor = cursor

    species = get_species(cursor)
    groups = get_groups(cursor, species)
    types = get_types(groups)
    datasets = get_datasets(types)

    data = dict(species=species,
                groups=groups,
                types=types,
                datasets=datasets,
                )

    # Relative path: resolved against the current working directory.
    output_file = """./wqflask/static/new/javascript/dataset_menu_structure.json"""

    with open(output_file, 'w') as fh:
        json.dump(data, fh, indent=" ", sort_keys=True)
+
+
def _test_it():
    """Used for internal testing only"""
    # The results are not inspected; the calls only exercise the queries.
    build_types("Mouse", "BXD")
    build_datasets("Mouse", "BXD", "Hippocampus")
+
+
if __name__ == '__main__':
    # Connection and cursor are both released when the block exits.
    with database_connection(get_setting("SQL_URI")) as conn, \
            conn.cursor() as cursor:
        main(cursor)
diff --git a/gn2/maintenance/generate_kinship_from_bimbam.py b/gn2/maintenance/generate_kinship_from_bimbam.py
new file mode 100644
index 00000000..9f01d094
--- /dev/null
+++ b/gn2/maintenance/generate_kinship_from_bimbam.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+
+"""
+Generate relatedness matrix files for GEMMA from BIMBAM genotype/phenotype files
+
+This file goes through all of the BIMBAM files in the bimbam directory
+and uses GEMMA to generate their corresponding kinship/relatedness matrix file
+
+"""
+
+import sys
+sys.path.append("..")
+import os
+import glob
+
+
class GenerateKinshipMatrices:
    """Run GEMMA to create the kinship matrix for one group's BIMBAM files."""

    # Machine-specific locations, unchanged from the original command line.
    GEMMA_BINARY = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma"
    OUTPUT_DIR = "/home/zas1024/genotype_files/genotype/bimbam/"

    def __init__(self, group_name, geno_file, pheno_file):
        self.group_name = group_name
        self.geno_file = geno_file
        self.pheno_file = pheno_file

    def build_command(self):
        """Return the GEMMA invocation as an argument list."""
        return [
            self.GEMMA_BINARY,
            "-g", self.geno_file,
            "-p", self.pheno_file,
            "-gk", "1",
            "-outdir", self.OUTPUT_DIR,
            "-o", self.group_name,
        ]

    def generate_kinship(self):
        """Run GEMMA for this group.

        :return: the exit status of the GEMMA process
        :raises OSError: if the GEMMA binary cannot be started
        """
        # Local import: the module's import block does not provide it.
        import subprocess

        command = self.build_command()
        print("command:", " ".join(command))
        # An argument list (no shell) keeps file and group names containing
        # spaces or shell metacharacters intact.
        return subprocess.run(command).returncode

    @classmethod
    def process_all(cls, geno_dir, bimbam_dir):
        """Generate a kinship matrix for every genotype file in *geno_dir*.

        Changes the working directory to *geno_dir*.  The BIMBAM input
        files are expected in *bimbam_dir*, named ``<group>_geno.txt`` and
        ``<group>_pheno.txt``.  A failing GEMMA run is reported and the
        loop continues; if GEMMA cannot be started at all the loop stops.
        """
        os.chdir(geno_dir)
        for input_file in glob.glob("*"):
            if not input_file.endswith(('geno', '.geno.gz')):
                continue
            group_name = ".".join(input_file.split('.')[:-1])
            if group_name == "HSNIH-Palmer":
                continue
            geno_input_file = os.path.join(
                bimbam_dir, group_name + "_geno.txt")
            pheno_input_file = os.path.join(
                bimbam_dir, group_name + "_pheno.txt")
            kinship = cls(group_name, geno_input_file, pheno_input_file)
            try:
                exit_status = kinship.generate_kinship()
            except OSError as why:
                # The binary is missing or not executable, so every other
                # group would fail the same way.
                print(" Exception:", why)
                break
            if exit_status != 0:
                print(" gemma exited with status %s for %s" % (
                    exit_status, group_name))
+
+
if __name__ == "__main__":
    # Equivalent manual invocation for a single group:
    # ./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD
    Geno_Directory = "/export/local/home/zas1024/genotype_files/genotype/"
    Bimbam_Directory = "/export/local/home/zas1024/genotype_files/genotype/bimbam/"
    GenerateKinshipMatrices.process_all(
        geno_dir=Geno_Directory, bimbam_dir=Bimbam_Directory)
diff --git a/gn2/maintenance/generate_probesetfreeze_file.py b/gn2/maintenance/generate_probesetfreeze_file.py
new file mode 100644
index 00000000..00c2cddf
--- /dev/null
+++ b/gn2/maintenance/generate_probesetfreeze_file.py
@@ -0,0 +1,122 @@
+#!/usr/bin/python
+
+import sys
+
+# sys.path.insert(0, "..") - why?
+
+import os
+import collections
+import csv
+
+from gn2.base import webqtlConfig
+
+from pprint import pformat as pf
+
+from gn2.utility.tools import get_setting
+from gn2.wqflask.database import database_connection
+
+
def show_progress(process, counter):
    """Print "<process>: <counter>" whenever counter is a multiple of 1000."""
    if counter % 1000:
        return
    print(f"{process}: {counter}")
+
+
def get_strains(cursor, group_name="BXD"):
    """Return the names of all strains belonging to a group.

    :param cursor: open database cursor
    :param group_name: InbredSet name to list strains for; defaults to
        "BXD", the group this script was written for
    :return: list of strain names
    """
    # Query parameters must be a sequence; a bare string would be treated
    # as a sequence of its characters by MySQLdb.
    cursor.execute("""select Strain.Name
                      from Strain, StrainXRef, InbredSet
                      where Strain.Id = StrainXRef.StrainId and
                      StrainXRef.InbredSetId = InbredSet.Id
                      and InbredSet.Name=%s;
                   """, (group_name,))

    strains = [strain[0] for strain in cursor.fetchall()]
    print("strains:", pf(strains))
    for strain in strains:
        print(" -", strain)

    return strains
+
+
def get_probeset_vals(cursor, dataset_name):
    """Fetch the per-strain values of every probeset in a dataset.

    :param cursor: open database cursor
    :param dataset_name: ProbeSetFreeze.Name of the dataset
    :return: OrderedDict mapping probeset name to an OrderedDict of
        strain name -> value
    """
    # Parameters are passed as a one-element tuple; a bare string would be
    # treated as a sequence of its characters by MySQLdb.
    cursor.execute(""" select ProbeSet.Id, ProbeSet.Name
                       from ProbeSetXRef,
                       ProbeSetFreeze,
                       ProbeSet
                       where ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and
                       ProbeSetFreeze.Name = %s and
                       ProbeSetXRef.ProbeSetId = ProbeSet.Id;
                   """, (dataset_name,))

    probesets = cursor.fetchall()

    print("Fetched probesets")

    probeset_vals = collections.OrderedDict()

    # One query per probeset; progress is reported every 1000 probesets.
    for counter, (probeset_id, probeset_name) in enumerate(probesets):
        cursor.execute(""" select Strain.Name, ProbeSetData.value
                           from ProbeSetData, ProbeSetXRef, ProbeSetFreeze, Strain
                           where ProbeSetData.Id = ProbeSetXRef.DataId
                           and ProbeSetData.StrainId = Strain.Id
                           and ProbeSetXRef.ProbeSetId = %s
                           and ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId
                           and ProbeSetFreeze.Name = %s;
                       """, (probeset_id, dataset_name))
        val_dic = collections.OrderedDict()
        for strain_name, value in cursor.fetchall():
            val_dic[strain_name] = value

        probeset_vals[probeset_name] = val_dic
        show_progress("Querying DB", counter)

    return probeset_vals
+
+
def trim_strains(strains, probeset_vals):
    """Keep only the strains that have a value in the first probeset.

    :param strains: list of strain names
    :param probeset_vals: mapping of probeset name to a mapping of strain
        name -> value; only its first entry is inspected
    :return: the strains present in the first probeset, in their original
        order; an empty list when *probeset_vals* is empty
    """
    trimmed_strains = []
    # next() with a default avoids an IndexError when no probeset was
    # fetched at all.
    first_probeset = next(iter(probeset_vals.values()), {})
    print("\n**** first_probeset is:", pf(first_probeset))
    for strain in strains:
        print("\n**** strain is:", pf(strain))
        if strain in first_probeset:
            trimmed_strains.append(strain)
    print("trimmed_strains:", pf(trimmed_strains))
    return trimmed_strains
+
+
def write_data_matrix_file(strains, probeset_vals, filename):
    """Write the probeset x strain value matrix as a fully quoted CSV file.

    The first row is "ID" followed by the strain names; every further row
    is a probeset name followed by its value for each strain.

    :param strains: list of strain names, defining the column order
    :param probeset_vals: mapping of probeset name to a mapping of strain
        name -> value
    :param filename: path of the file to write
    :raises KeyError: if a probeset has no value for one of the strains
    """
    # csv.writer emits str, so the file must be opened in text mode;
    # newline="" lets the csv module control the line endings itself.
    with open(filename, "w", newline="") as fh:
        csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL)
        csv_writer.writerow(['ID'] + strains)
        for counter, probeset in enumerate(probeset_vals):
            values = probeset_vals[probeset]
            csv_writer.writerow([probeset] + [values[strain] for strain in strains])
            show_progress("Writing", counter)
+
+
def main():
    """Dump one hard-coded dataset as a strain-by-probeset CSV matrix."""
    dataset_name = "Eye_AXBXA_1008_RankInv"
    output_path = os.path.expanduser(
        "~/gene/wqflask/maintenance/"
        "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2"
        "(Oct08)_RankInv_Beta.txt")

    with database_connection(get_setting("SQL_URI")) as conn, \
            conn.cursor() as cursor:
        strains = get_strains(cursor)
        print("Getting probset_vals")
        probeset_vals = get_probeset_vals(cursor, dataset_name)
        print("Finished getting probeset_vals")
        # Only strains that have data in the first probeset become columns.
        write_data_matrix_file(
            trim_strains(strains, probeset_vals), probeset_vals, output_path)
+
+
if __name__ == '__main__':
    # Everything the script needs is hard-coded inside main().
    main()
diff --git a/gn2/maintenance/geno_to_json.py b/gn2/maintenance/geno_to_json.py
new file mode 100644
index 00000000..7be2ed83
--- /dev/null
+++ b/gn2/maintenance/geno_to_json.py
@@ -0,0 +1,196 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to json
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to json files that are used when running the marker regression
+code
+
+"""
+
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+import gzip
+
+#import numpy as np
+#from pyLMM import lmm
+
+import simplejson as json
+
+from pprint import pformat as pf
+
+#from gn2.utility.tools import flat_files
+
+
class EmptyConfigurations(Exception):
    """Raised when a data row is reached before any ``@`` configuration line."""
+
+
class Marker:
    """Plain record for one marker row of a .geno file.

    Every field starts out empty and is filled in by the converter; the
    instance ``__dict__`` is what ends up serialised.
    """

    def __init__(self):
        self.name = self.chr = None
        self.cM = self.Mb = None
        # A fresh list per instance so markers never share genotypes.
        self.genotypes = list()
+
+
class ConvertGenoFile:
    """Convert one tab-delimited ``.geno`` file into a JSON list of markers.

    Header lines of the form ``@mat:B`` define which genotype symbol maps to
    which output value; the ``Chr ...`` header line tells whether ``cM`` and
    ``Mb`` columns are present. Every remaining row becomes one marker dict.
    """

    # Maps the key of an ``@key:symbol`` header line to the value written to
    # the JSON output for genotypes carrying that symbol.
    haplotype_notation = {
        '@mat': "1",
        '@pat': "0",
        '@het': "0.5",
        '@unk': "NA"
    }

    def __init__(self, input_file, output_file):
        """Remember the input and output paths and reset the parse state."""
        self.input_file = input_file
        self.output_file = output_file

        self.mb_exists = False
        self.cm_exists = False
        self.markers = []
        self.configurations = {}

        # Bookkeeping read by process_all() to report where a failure occurred.
        self.latest_row_pos = None
        self.latest_col_pos = None

        self.latest_row_value = None
        self.latest_col_value = None

    def convert(self):
        """Parse ``self.input_file`` and write the markers to ``self.output_file``.

        Raises:
            EmptyConfigurations: a data row was found before any ``@`` line.
        """
        self.configurations = {}

        # Gzipped genofiles are decompressed transparently, in text mode so
        # that rows are str just like for plain files.
        if self.input_file.endswith(".gz"):
            input_fh = gzip.open(self.input_file, "rt")
        else:
            input_fh = open(self.input_file)

        # The context manager guarantees the input handle is closed even
        # when parsing fails part-way through.
        with input_fh as self.input_fh:
            self.process_csv()

    def process_csv(self):
        """Turn every data row into a marker dict, then dump them as JSON."""
        for row in self.process_rows():
            # Drop the line ending first; otherwise it sticks to the last
            # genotype, which then never matches a configured symbol.
            row_items = row.rstrip("\r\n").split("\t")

            this_marker = Marker()
            this_marker.chr = row_items[0]
            this_marker.name = row_items[1]

            # Genotypes start after chr/name and the optional cM/Mb columns
            # (cM comes before Mb when both are present).
            first_genotype_col = 2
            if self.cm_exists:
                this_marker.cM = row_items[first_genotype_col]
                first_genotype_col += 1
            if self.mb_exists:
                this_marker.Mb = row_items[first_genotype_col]
                first_genotype_col += 1

            for item_count, genotype in enumerate(
                    row_items[first_genotype_col:]):
                self.latest_col_pos = item_count + first_genotype_col
                self.latest_col_value = genotype
                # Symbols without a configuration are treated as unknown.
                this_marker.genotypes.append(
                    self.configurations.get(genotype.upper(), "NA"))

            self.markers.append(this_marker.__dict__)

        # Written only after parsing succeeded, so a failed conversion does
        # not leave an empty output file behind.
        with open(self.output_file, 'w') as fh:
            json.dump(self.markers, fh, indent=" ", sort_keys=True)

    def process_rows(self):
        """Yield the data rows of ``self.input_fh``, consuming header lines.

        Blank lines and ``#`` comments are skipped, the ``Chr`` header sets
        ``mb_exists`` / ``cm_exists``, and ``@key:symbol`` lines fill
        ``self.configurations``.

        Raises:
            EmptyConfigurations: a data row was found before any ``@`` line
                that defines a known haplotype key.
        """
        for self.latest_row_pos, row in enumerate(self.input_fh):
            self.latest_row_value = row
            # Take care of headers
            if not row.strip():
                continue
            if row.startswith('#'):
                continue
            if row.startswith('Chr'):
                columns = row.split()
                if 'Mb' in columns:
                    self.mb_exists = True
                if 'cM' in columns:
                    self.cm_exists = True
                continue
            if row.startswith('@'):
                key, _separator, value = row.partition(':')
                key = key.strip()
                value = value.strip()
                if key in self.haplotype_notation:
                    # Genotypes are looked up upper-cased, so store the
                    # symbol upper-cased as well.
                    self.configurations[value.upper()] = (
                        self.haplotype_notation[key])
                continue
            if not self.configurations:
                raise EmptyConfigurations
            yield row

    @classmethod
    def process_all(cls, old_directory, new_directory):
        """Convert every genofile in ``old_directory`` into ``new_directory``.

        Changes the working directory to ``old_directory``. Files without
        configuration lines are skipped; any other error is reported and
        stops the run.
        """
        os.chdir(old_directory)
        for input_file in glob.glob("*"):
            if not input_file.endswith(('geno', '.geno.gz')):
                continue
            # Strip ".gz" before dropping the extension so "X.geno.gz" maps
            # to "X.json" rather than "X.geno.json".
            if input_file.endswith(".gz"):
                base_name = input_file[:-len(".gz")]
            else:
                base_name = input_file
            group_name = ".".join(base_name.split('.')[:-1])
            output_file = os.path.join(new_directory, group_name + ".json")
            print("%s -> %s" % (
                os.path.join(old_directory, input_file), output_file))
            convertob = cls(input_file, output_file)
            try:
                convertob.convert()
            except EmptyConfigurations:
                print(" No config info? Continuing...")
                continue
            except Exception as why:
                print(" Exception:", why)
                # print_exc() writes the traceback itself and returns None.
                traceback.print_exc()
                print(" Found in row %s at tabular column %s" % (
                    convertob.latest_row_pos, convertob.latest_col_pos))
                print(" Column is:", convertob.latest_col_value)
                print(" Row is:", convertob.latest_row_value)
                break
+
+
if __name__ == "__main__":
    # Source and destination directories are hard-coded for the machine this
    # script was written on; adjust them before running elsewhere.
    Old_Geno_Directory = "/export/local/home/zas1024/gn2-zach/genotype_files/genotype"
    New_Geno_Directory = "/export/local/home/zas1024/gn2-zach/genotype_files/genotype/json"
    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
diff --git a/gn2/maintenance/get_group_samplelists.py b/gn2/maintenance/get_group_samplelists.py
new file mode 100644
index 00000000..6af637ea
--- /dev/null
+++ b/gn2/maintenance/get_group_samplelists.py
@@ -0,0 +1,47 @@
+import os
+import glob
+import gzip
+
+from gn2.base import webqtlConfig
+
+
def get_samplelist(file_type, geno_file):
    """Return the sample names of ``geno_file`` for the given ``file_type``.

    ``"geno"`` and ``"plink"`` are handled; any other type yields ``None``.
    """
    if file_type == "geno":
        return get_samplelist_from_geno(geno_file)
    if file_type == "plink":
        return get_samplelist_from_plink(geno_file)
    return None
+
def get_samplelist_from_geno(genofilename):
    """Return the sample names from the header line of a .geno file.

    If ``genofilename + '.gz'`` exists it is read instead. The header is the
    first line that is neither blank nor a ``#`` / ``@`` line; samples start
    at the fifth column when the fourth column is ``Mb`` and at the fourth
    column otherwise.

    Returns:
        list of sample names; empty when the file has no header line or the
        header has no sample columns.
    """
    if os.path.isfile(genofilename + '.gz'):
        genofilename += '.gz'
        # Text mode: gzip.open defaults to binary, which would yield bytes
        # and break the str-based parsing below.
        genofile = gzip.open(genofilename, "rt")
    else:
        genofile = open(genofilename)

    header = None
    with genofile:
        for line in genofile:
            line = line.strip()
            if not line or line.startswith(("#", "@")):
                continue
            header = line
            break

    if header is None:
        return []

    headers = header.split("\t")
    first_sample = 4 if len(headers) > 3 and headers[3] == "Mb" else 3
    return headers[first_sample:]
+
+
def get_samplelist_from_plink(genofilename):
    """Return the second space-separated field of every line of the file.

    Lines with fewer than two fields (e.g. blank lines) are skipped.
    """
    samplelist = []
    # The context manager closes the handle; the original left it open.
    with open(genofilename) as genofile:
        for line in genofile:
            fields = line.rstrip("\r\n").split(" ")
            if len(fields) < 2:
                continue
            samplelist.append(fields[1])
    return samplelist
diff --git a/gn2/maintenance/print_benchmark.py b/gn2/maintenance/print_benchmark.py
new file mode 100644
index 00000000..9d12da8a
--- /dev/null
+++ b/gn2/maintenance/print_benchmark.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+
+import time
+
+from pprint import pformat as pf
+
+
class TheCounter:
    """Time 170000 calls to ``self.print_it`` and record the duration.

    Subclasses supply ``print_it``; the elapsed time is stored on the
    instance and in the shared ``Counters`` dict under the class name.
    """

    # Shared across all subclasses: class name -> seconds taken.
    Counters = {}

    def __init__(self):
        began = time.time()
        for number in range(170000):
            self.print_it(number)
        elapsed = time.time() - began
        self.time_took = elapsed
        TheCounter.Counters[type(self).__name__] = elapsed
+
+
class PrintAll(TheCounter):
    """Benchmark variant that prints every counter value."""

    def print_it(self, counter):
        print(counter)
+
+
class PrintSome(TheCounter):
    """Benchmark variant that prints every 1000th counter value."""

    def print_it(self, counter):
        if counter % 1000:
            return
        print(counter)
+
+
class PrintNone(TheCounter):
    """Benchmark variant that prints nothing (baseline loop cost)."""

    def print_it(self, counter):
        return None
+
+
def new_main():
    """Run each printing benchmark once and print the collected timings."""
    print("Running new_main")
    # Instantiating a benchmark class runs it.
    for benchmark in (PrintAll, PrintSome, PrintNone):
        benchmark()

    timings = pf(TheCounter.Counters)
    print(timings)
+
+
# Run the benchmarks only when executed as a script.
if __name__ == '__main__':
    new_main()
diff --git a/gn2/maintenance/quantile_normalize.py b/gn2/maintenance/quantile_normalize.py
new file mode 100644
index 00000000..5620b552
--- /dev/null
+++ b/gn2/maintenance/quantile_normalize.py
@@ -0,0 +1,98 @@
+import sys
+sys.path.insert(0, './')
+import urllib.parse
+
+import numpy as np
+import pandas as pd
+
+from flask import Flask, g, request
+
+from gn2.wqflask import app
+from gn2.wqflask.database import database_connection
+from gn2.utility.tools import get_setting
+
+
def create_dataframe(input_file):
    """Load the numeric part of a tab-delimited matrix file as a DataFrame.

    The first line (header) and the first column (row IDs) are dropped; the
    remaining cells are parsed as floats.

    Args:
        input_file: path of a tab-delimited text file.

    Returns:
        pandas.DataFrame with one row per data line and default integer
        row/column labels.
    """
    # The header is only read to learn how many columns the file has.
    with open(input_file) as f:
        ncols = len(f.readline().split("\t"))

    # Passing the path lets numpy open and close the file itself (the
    # original handed over an open handle that was never closed). ndmin=2
    # keeps a file with a single data row as one row instead of a column.
    input_array = np.loadtxt(
        input_file, delimiter="\t", skiprows=1,
        usecols=list(range(1, ncols)), ndmin=2)
    return pd.DataFrame(input_array)
+
+# This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+
+
def quantileNormalize(df_input):
    """Return a quantile-normalized copy of ``df_input``.

    Each value is replaced by the mean, across columns, of the values that
    share its rank position within their own sorted column. The input frame
    is left untouched.
    """
    result = df_input.copy()

    # Sort every column independently, then average across columns per rank.
    ordered = pd.DataFrame({name: sorted(result[name]) for name in result})
    row_means = ordered.mean(axis=1).tolist()

    # Map each value to the mean belonging to its rank in its own column.
    for name in result:
        column = result[name]
        positions = np.searchsorted(np.sort(column), column)
        result[name] = [row_means[pos] for pos in positions]
    return result
+
+
def set_data(cursor, dataset_name, data_dir="/home/zas1024/cfw_data/",
             quant_file="quant_norm.csv"):
    """Yield one "trait" document per data row of a dataset file.

    ``<data_dir>/<dataset_name>.txt`` (original values) and
    ``<data_dir>/<quant_file>`` (quantile-normalized values) are read in
    lockstep. Both are tab-delimited; the first line of the dataset file
    holds the sample names and the first field of every other line is the
    trait name.

    Args:
        cursor: DB-API cursor used to look up species, group and dataset
            full name for each trait.
        dataset_name: ProbeSetFreeze name; also the stem of the input file.
        data_dir: directory holding both input files.
        quant_file: file name of the quantile-normalized matrix.

    Yields:
        dict with ``_index``, ``_type`` and ``_source`` keys; ``_source``
        carries the trait metadata and its per-sample values.
    """
    # Local import: this module does not import os at file level.
    import os

    orig_path = os.path.join(data_dir, dataset_name + ".txt")
    quant_path = os.path.join(data_dir, quant_file)

    # Placeholders instead of string interpolation, so names containing
    # quotes cannot break (or inject into) the statement.
    query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName
               FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet
               WHERE Species.Id = InbredSet.SpeciesId and
                     InbredSet.Id = ProbeFreeze.InbredSetId and
                     ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId and
                     ProbeSetFreeze.Name = %s and
                     ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
                     ProbeSetXRef.ProbeSetId = ProbeSet.Id and
                     ProbeSet.Name = %s"""

    sample_names = []
    with open(orig_path, 'r') as orig_fh, open(quant_path, 'r') as quant_fh:
        for line_no, (line1, line2) in enumerate(zip(orig_fh, quant_fh)):
            # Strip the line ending so it does not end up in the last
            # sample name or value.
            orig_fields = line1.rstrip("\r\n").split('\t')
            quant_fields = line2.rstrip("\r\n").split('\t')

            if line_no == 0:
                sample_names = orig_fields[1:]
                continue

            trait_name = orig_fields[0]
            # Field 0 is the trait name, so sample values start at index 1.
            sample_list = [
                {
                    "name": sample,
                    "value": orig_fields[pos],
                    "qnorm": quant_fields[pos]
                }
                for pos, sample in enumerate(sample_names, start=1)
            ]

            cursor.execute(query, (dataset_name, trait_name))
            result_info = cursor.fetchone()

            yield {
                "_index": "traits",
                "_type": "trait",
                "_source": {
                    "name": trait_name,
                    "species": result_info[0],
                    "group": result_info[1],
                    "dataset": dataset_name,
                    "dataset_fullname": result_info[2],
                    "samples": sample_list,
                    "transform_types": "qnorm"
                }
            }
+
+
if __name__ == '__main__':
    # NOTE(review): `bulk` and `es` are neither imported nor defined in this
    # file, so this entry point raises NameError as written. They look like
    # leftovers of an Elasticsearch client -- confirm before reviving.
    with database_connection(get_setting("SQL_URI")) as conn:
        with conn.cursor() as cursor:
            # Index every trait document produced for the dataset named on
            # the command line.
            success, _ = bulk(es, set_data(cursor, sys.argv[1]))

            # Query one known trait back and show the response.
            response = es.search(
                index="traits", doc_type="trait", body={
                    "query": {"match": {"name": "ENSMUSG00000028982"}}
                }
            )

            print(response)
diff --git a/gn2/maintenance/set_resource_defaults.py b/gn2/maintenance/set_resource_defaults.py
new file mode 100644
index 00000000..f9e5494a
--- /dev/null
+++ b/gn2/maintenance/set_resource_defaults.py
@@ -0,0 +1,153 @@
+"""
+
+Script that sets default resource access masks for use with the DB proxy
+
+Defaults will be:
+Owner - omni_gn
+Mask - Public/non-confidential: { data: "view",
+ metadata: "view",
+ admin: "not-admin" }
+ Private/confidential: { data: "no-access",
+ metadata: "no-access",
+ admin: "not-admin" }
+
+To run:
+ ./bin/genenetwork2 ~/my_settings.py -c ./wqflask/maintenance/set_resource_defaults.py
+
+"""
+
+import sys
+import json
+
+# NEW: Note we prepend the current path - otherwise a guix instance of GN2 may be used instead
+sys.path.insert(0, './')
+
+# NEW: import app to avoid a circular dependency on utility.tools
+from gn2.wqflask import app
+
+from gn2.utility import hmac
+from gn2.utility.tools import get_setting
+from gn2.utility.redis_tools import get_redis_conn, get_user_id, add_resource, get_resources, get_resource_info
+Redis = get_redis_conn()
+
+import urllib.parse
+
+from gn2.wqflask.database import database_connection
+
+
def insert_probeset_resources(cursor, default_owner_id):
    """Register every ProbeSetFreeze row as a "dataset-probeset" resource.

    A dataset is viewable by default only when its confidentiality is below
    1 and its public flag is above 0; otherwise the default mask denies
    access. Existing resources are not overwritten (``update=False``).

    Args:
        cursor: DB-API cursor used to read ProbeSetFreeze.
        default_owner_id: owner id stored on every resource.
    """
    cursor.execute(""" SELECT
                            ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.confidentiality, ProbeSetFreeze.public
                        FROM
                            ProbeSetFreeze""")

    for dataset_id, name, confidentiality, public in cursor.fetchall():
        access = "view" if confidentiality < 1 and public > 0 else "no-access"
        resource_ob = {
            'name': name,
            'owner_id': default_owner_id,
            'data': {"dataset": str(dataset_id)},
            'type': "dataset-probeset",
            'default_mask': {"data": access,
                             "metadata": access,
                             "admin": "not-admin"},
            'group_masks': {},
        }

        add_resource(resource_ob, update=False)
+
+
def insert_publish_resources(cursor, default_owner_id):
    """Register published traits as viewable "dataset-publish" resources.

    Rows without an InbredSetCode are skipped. Each resource is named
    ``<InbredSetCode>_<PublishXRef.Id>``. Existing resources are not
    overwritten (``update=False``).

    Args:
        cursor: DB-API cursor used to read the publish tables.
        default_owner_id: owner id stored on every resource.
    """
    cursor.execute(""" SELECT
                            PublishXRef.Id, PublishFreeze.Id, InbredSet.InbredSetCode
                        FROM
                            PublishXRef, PublishFreeze, InbredSet, Publication
                        WHERE
                            PublishFreeze.InbredSetId = PublishXRef.InbredSetId AND
                            InbredSet.Id = PublishXRef.InbredSetId AND
                            Publication.Id = PublishXRef.PublicationId""")

    for trait_id, dataset_id, group_code in cursor.fetchall():
        # Only traits whose group has a code get a resource.
        if not group_code:
            continue

        resource_ob = {
            'name': group_code + "_" + str(trait_id),
            'owner_id': default_owner_id,
            'data': {"dataset": str(dataset_id),
                     "trait": str(trait_id)},
            'type': "dataset-publish",
            'default_mask': {"data": "view",
                             "metadata": "view",
                             "admin": "not-admin"},
            'group_masks': {},
        }

        add_resource(resource_ob, update=False)
+
+
def insert_geno_resources(cursor, default_owner_id):
    """Register every GenoFreeze row as a "dataset-geno" resource.

    A dataset is viewable by default when its confidentiality is below 1;
    otherwise the default mask denies access. The ``HET3-ITPGeno`` dataset
    always gets a fixed owner id instead of ``default_owner_id``. Existing
    resources are not overwritten (``update=False``).

    Args:
        cursor: DB-API cursor used to read GenoFreeze.
        default_owner_id: owner id stored on every other resource.
    """
    cursor.execute(""" SELECT
                            GenoFreeze.Id, GenoFreeze.ShortName, GenoFreeze.confidentiality
                        FROM
                            GenoFreeze""")

    for dataset_id, short_name, confidentiality in cursor.fetchall():
        if short_name == "HET3-ITPGeno":
            owner_id = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae"
        else:
            owner_id = default_owner_id
        access = "view" if confidentiality < 1 else "no-access"

        resource_ob = {
            'name': short_name,
            'owner_id': owner_id,
            'data': {"dataset": str(dataset_id)},
            'type': "dataset-geno",
            'default_mask': {"data": access,
                             "metadata": access,
                             "admin": "not-admin"},
            'group_masks': {},
        }

        add_resource(resource_ob, update=False)
+
+
def insert_resources(default_owner_id, cursor=None):
    """Insert the publish, geno and probeset default resources, in that order.

    Args:
        default_owner_id: owner id passed on to every insert helper.
        cursor: DB-API cursor to query with. When omitted, the module-level
            ``cursor`` bound by the ``__main__`` block is used, which keeps
            the previous one-argument call style working.

    Raises:
        KeyError: no cursor was passed and no module-level ``cursor`` exists.
    """
    if cursor is None:
        # Legacy path: the function used to read the global implicitly.
        cursor = globals()["cursor"]

    print("START")
    insert_publish_resources(cursor, default_owner_id)
    print("AFTER PUBLISH")
    insert_geno_resources(cursor, default_owner_id)
    print("AFTER GENO")
    insert_probeset_resources(cursor, default_owner_id)
    print("AFTER PROBESET")


def main(cursor):
    """Delete the "resources" entry in Redis and re-insert the defaults."""

    Redis.delete("resources")

    owner_id = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae"

    # Pass the cursor explicitly instead of relying on a module global.
    insert_resources(owner_id, cursor)
+
+
if __name__ == '__main__':
    # Connection and cursor are both released when the block exits.
    with database_connection(get_setting("SQL_URI")) as conn, \
            conn.cursor() as cursor:
        main(cursor)