Namespace all modules under gn2.

We move all modules under a gn2 directory. This is important for "correct" packaging and deployment as a Guix service.
author: Arun Isaac 2023-12-29 18:55:37 +0000
committer: Arun Isaac 2023-12-29 19:01:46 +0000
commit: 204a308be0f741726b9a620d88fbc22b22124c81 (patch)
tree: b3cf66906674020b530c844c2bb4982c8a0e2d39 /gn2/maintenance
parent: 83062c75442160427b50420161bfcae2c5c34c84 (diff)
download: genenetwork2-204a308be0f741726b9a620d88fbc22b22124c81.tar.gz
13 files changed, 1553 insertions, 0 deletions
diff --git a/gn2/maintenance/README.md b/gn2/maintenance/README.md
new file mode 100644
index 00000000..873eaa32
--- /dev/null
+++ b/gn2/maintenance/README.md
@@ -0,0 +1,4 @@
+Maintenance files have been moved into a separate repository named
+*gn_extra*. See https://github.com/genenetwork/gn_extra
+
+
diff --git a/gn2/maintenance/__init__.py b/gn2/maintenance/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/gn2/maintenance/__init__.py
diff --git a/gn2/maintenance/convert_dryad_to_bimbam.py b/gn2/maintenance/convert_dryad_to_bimbam.py
new file mode 100644
index 00000000..18fbb8a1
--- /dev/null
+++ b/gn2/maintenance/convert_dryad_to_bimbam.py
@@ -0,0 +1,72 @@
+#!/usr/bin/python
+
+"""
+Convert data dryad files to a BIMBAM _geno and _snps file
+
+
+"""
+
+import sys
+sys.path.append("..")
+
+
+def read_dryad_file(filename):
+    """Read a space-separated Dryad genotype file into per-marker rows.
+
+    The first line is treated as the header: every field after the first two
+    is a marker name. Each following line describes one sample as
+    ``<sample name> <flag> <genotype> ...``. Samples whose flag is not the
+    literal ``no`` are skipped and counted; that count is printed.
+
+    :param filename: path of the Dryad text file to read
+    :return: one list per marker, ``[marker, "X", "Y", genotype, ...]``, with
+        one genotype per retained sample in file order
+    """
+    exclude_count = 0
+    marker_list = []
+    sample_dict = {}
+    sample_list = []
+    with open(filename, 'r') as the_file:
+        for i, line in enumerate(the_file):
+            # Drop the line terminator before splitting; otherwise the last
+            # marker name and every sample's last genotype carry a "\n".
+            fields = line.rstrip("\r\n").split(" ")
+            if i == 0:
+                marker_list = fields[2:]
+                continue
+            if len(fields) < 2:
+                # Blank or truncated line (e.g. an empty line at end of file)
+                continue
+            if fields[1] == "no":
+                sample_name = fields[0]
+                sample_list.append(sample_name)
+                sample_dict[sample_name] = fields[2:]
+            else:
+                exclude_count += 1
+
+    # Transpose: the input is one line per sample, the output one row per marker
+    geno_rows = [
+        [marker, "X", "Y"] + [sample_dict[sample][i] for sample in sample_list]
+        for i, marker in enumerate(marker_list)
+    ]
+
+    print(exclude_count)
+
+    return geno_rows
+
+
+def write_bimbam_files(geno_rows, output_file='/home/zas1024/cfw_data/CFW_geno.txt'):
+    """Write genotype rows as comma-separated lines.
+
+    :param geno_rows: iterable of rows, each a list of strings
+    :param output_file: destination path; defaults to the location this
+        script has always written to, so existing callers are unaffected
+    """
+    with open(output_file, 'w') as geno_fh:
+        geno_fh.writelines(", ".join(row) + "\n" for row in geno_rows)
+
+
+def convert_dryad_to_bimbam(filename):
+    """Read the Dryad file at ``filename`` and write its rows as a genotype file."""
+    write_bimbam_files(read_dryad_file(filename))
+
+
+# Script entry point: the single command-line argument is the base name
+# (without ".txt") of an input file under /home/zas1024/cfw_data/.
+if __name__ == "__main__":
+    input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+    convert_dryad_to_bimbam(input_filename)
diff --git a/gn2/maintenance/convert_geno_to_bimbam.py b/gn2/maintenance/convert_geno_to_bimbam.py
new file mode 100644
index 00000000..078be529
--- /dev/null
+++ b/gn2/maintenance/convert_geno_to_bimbam.py
@@ -0,0 +1,201 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to BIMBAM files
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to BIMBAM genotype, phenotype and SNP files that are used
+when running the marker regression code
+
+"""
+
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+import gzip
+
+import simplejson as json
+
+from pprint import pformat as pf
+
+
+class EmptyConfigurations(Exception):
+    """Raised when a .geno file yields no usable genotype-symbol configuration."""
+
+
+class Marker:
+    """Plain record for one marker row; consumers read it through ``__dict__``."""
+
+    def __init__(self):
+        # Chained assignment binds targets left to right, so the attribute
+        # order (name, chr, cM, Mb, genotypes) is preserved in __dict__.
+        self.name = self.chr = self.cM = self.Mb = None
+        self.genotypes = list()
+
+
+class ConvertGenoFile:
+    """Convert one .geno file into BIMBAM genotype, phenotype and SNP files.
+
+    ``output_files`` is indexed as ``[geno_path, pheno_path, snps_path]``.
+    """
+
+    def __init__(self, input_file, output_files):
+        self.input_file = input_file
+        self.output_files = output_files
+
+        self.mb_exists = False
+        self.cm_exists = False
+        self.markers = []
+        # Filled from the "Chr ..." header row; stays empty when the file has
+        # none, so write_to_bimbam never hits an undefined attribute.
+        self.sample_list = []
+
+        # Progress markers reported by process_all when a conversion fails
+        self.latest_row_pos = None
+        self.latest_col_pos = None
+
+        self.latest_row_value = None
+        self.latest_col_value = None
+
+    def convert(self):
+        """Parse ``self.input_file`` and write the three output files.
+
+        Raises EmptyConfigurations when the file defines no genotype symbols.
+        """
+        # Maps the "@..." header keys of a .geno file to the output encoding
+        self.haplotype_notation = {
+            '@mat': "1",
+            '@pat': "0",
+            '@het': "0.5",
+            '@unk': "NA"
+        }
+
+        self.configurations = {}
+        # process_all accepts ".geno.gz" inputs, so compressed files have to
+        # be read through gzip; the handle is closed even if parsing fails.
+        opener = gzip.open if self.input_file.endswith(".gz") else open
+        with opener(self.input_file, "rt") as self.input_fh:
+            self.process_csv()
+
+    def process_csv(self):
+        """Turn every data row into a marker dict, then write the outputs."""
+        for row in self.process_rows():
+            row_items = row.split("\t")
+
+            this_marker = Marker()
+            this_marker.name = row_items[1]
+            this_marker.chr = row_items[0]
+            # Genotype columns start after whichever of cM/Mb are present
+            if self.cm_exists and self.mb_exists:
+                this_marker.cM = row_items[2]
+                this_marker.Mb = row_items[3]
+                genotypes = row_items[4:]
+            elif self.cm_exists:
+                this_marker.cM = row_items[2]
+                genotypes = row_items[3:]
+            elif self.mb_exists:
+                this_marker.Mb = row_items[2]
+                genotypes = row_items[3:]
+            else:
+                genotypes = row_items[2:]
+            for genotype in genotypes:
+                # Symbols not declared in the file header are written as "NA"
+                symbol = genotype.upper().strip()
+                this_marker.genotypes.append(
+                    self.configurations.get(symbol, "NA"))
+
+            self.markers.append(this_marker.__dict__)
+
+        self.write_to_bimbam()
+
+    def write_to_bimbam(self):
+        """Write the genotype, phenotype and SNP files from ``self.markers``."""
+        with open(self.output_files[0], "w") as geno_fh:
+            for marker in self.markers:
+                geno_fh.write(marker['name'])
+                geno_fh.write(", X, Y")
+                geno_fh.write(", " + ", ".join(marker['genotypes']))
+                geno_fh.write("\n")
+
+        # One constant "1" line per sample
+        with open(self.output_files[1], "w") as pheno_fh:
+            for _sample in self.sample_list:
+                pheno_fh.write("1\n")
+
+        with open(self.output_files[2], "w") as snp_fh:
+            for marker in self.markers:
+                # Prefer the Mb position; fall back to cM when Mb is absent
+                position = marker['Mb'] if self.mb_exists else marker['cM']
+                snp_fh.write(
+                    marker['name'] + ", " + str(int(float(position) * 1000000)) + ", " + marker['chr'] + "\n")
+
+    def get_sample_list(self, row_contents):
+        """Store the sample names found after the fixed header columns."""
+        self.sample_list = []
+        if self.mb_exists:
+            if self.cm_exists:
+                self.sample_list = row_contents[4:]
+            else:
+                self.sample_list = row_contents[3:]
+        else:
+            if self.cm_exists:
+                self.sample_list = row_contents[3:]
+            else:
+                self.sample_list = row_contents[2:]
+
+    def process_rows(self):
+        """Yield data rows, consuming blank, comment, header and "@" lines."""
+        for self.latest_row_pos, row in enumerate(self.input_fh):
+            self.latest_row_value = row
+            # Take care of headers
+            if not row.strip():
+                continue
+            if row.startswith('#'):
+                continue
+            if row.startswith('Chr'):
+                if 'Mb' in row.split():
+                    self.mb_exists = True
+                if 'cM' in row.split():
+                    self.cm_exists = True
+                self.get_sample_list(row.split())
+                continue
+            if row.startswith('@'):
+                key, _separater, value = row.partition(':')
+                key = key.strip()
+                value = value.strip()
+                if key == "@filler":
+                    raise EmptyConfigurations
+                if key in self.haplotype_notation:
+                    self.configurations[value] = self.haplotype_notation[key]
+                continue
+            if not len(self.configurations):
+                raise EmptyConfigurations
+            yield row
+
+    @classmethod
+    def process_all(cls, old_directory, new_directory):
+        """Convert every genotype file in ``old_directory`` into ``new_directory``.
+
+        Changes the working directory to ``old_directory``. Files without
+        configuration info are skipped; any other error stops the run.
+        """
+        os.chdir(old_directory)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('geno', '.geno.gz')):
+                continue
+            # Strip the compression suffix first so "X.geno.gz" and "X.geno"
+            # both give the group name "X".
+            base_name = input_file[:-len('.gz')] if input_file.endswith('.gz') else input_file
+            group_name = ".".join(base_name.split('.')[:-1])
+            if group_name == "HSNIH-Palmer":
+                continue
+            geno_output_file = os.path.join(
+                new_directory, group_name + "_geno.txt")
+            pheno_output_file = os.path.join(
+                new_directory, group_name + "_pheno.txt")
+            snp_output_file = os.path.join(
+                new_directory, group_name + "_snps.txt")
+            output_files = [geno_output_file,
+                            pheno_output_file, snp_output_file]
+            print("%s -> %s" % (
+                os.path.join(old_directory, input_file), geno_output_file))
+            convertob = ConvertGenoFile(input_file, output_files)
+            try:
+                convertob.convert()
+            except EmptyConfigurations:
+                print("  No config info? Continuing...")
+                continue
+            except Exception as why:
+                print("  Exception:", why)
+                # print_exc() writes the traceback itself and returns None
+                traceback.print_exc()
+                print("    Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+                                                                    convertob.latest_col_pos))
+                print("    Column is:", convertob.latest_col_value)
+                print("    Row is:", convertob.latest_row_value)
+                break
+
+
+# Script entry point: convert every genotype file in the hard-coded source
+# directory into BIMBAM files under its "bimbam" subdirectory.
+if __name__ == "__main__":
+    Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
+    New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/bimbam"""
+    #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
+    #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
+    #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
+    # convertob.convert()
+    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
+    # ConvertGenoFiles(Geno_Directory)
diff --git a/gn2/maintenance/gen_ind_genofiles.py b/gn2/maintenance/gen_ind_genofiles.py
new file mode 100644
index 00000000..b755c648
--- /dev/null
+++ b/gn2/maintenance/gen_ind_genofiles.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""A script that generates the genotype files for groups of individuals, using an existing strain genotype file as a basis
+
+Example commands:
+python3 gen_ind_genofiles.py
+        /home/zas1024/gn2-zach/genotype_files/genotype/
+        /home/zas1024/gn2-zach/new_geno/
+        BXD-Micturition.geno
+        BXD.json
+python3 gen_ind_genofiles.py
+        /home/zas1024/gn2-zach/genotype_files/genotype
+        /home/zas1024/gn2-zach/new_geno/
+        BXD-Micturition.geno
+        BXD.2.geno BXD.4.geno BXD.5.geno
+
+"""
+
+import json
+import os
+import sys
+from typing import List
+
+import MySQLdb
+
+def conn():
+    """Open a MySQL connection configured from the DB_* environment variables."""
+    env = os.environ
+    settings = {
+        "db": env.get("DB_NAME"),
+        "user": env.get("DB_USER"),
+        "passwd": env.get("DB_PASS"),
+        "host": env.get("DB_HOST"),
+    }
+    return MySQLdb.Connect(**settings)
+
+def main(args):
+    """Generate individual-level .geno file(s) for a target group.
+
+    :param args: argument vector laid out as
+        ``[prog, geno_dir, out_dir, target_geno, source, ...]`` where
+        ``source`` is either one JSON file describing the source .geno files
+        or one or more source .geno file names (``.geno`` is appended when
+        missing)
+    """
+    if len(args) < 5:
+        sys.exit("Usage: gen_ind_genofiles.py GENO_DIR OUT_DIR TARGET_GENO SOURCE [SOURCE ...]")
+
+    # Directory in which .geno files are located
+    geno_dir = args[1]
+
+    # Directory in which to output new files
+    out_dir = args[2]
+
+    # The individuals group that we want to generate a .geno file for.
+    # os.path.join makes this work whether or not geno_dir ends with "/".
+    target_file = os.path.join(geno_dir, args[3])
+
+    # The source group(s) we're generating the .geno files from
+    # This can be passed as either a specific .geno file (or set of files as multiple arguments),
+    # or as a JSON file containing a set of .geno files (and their corresponding file names and sample lists)
+    source_files = []
+    if ".json" in args[4]:
+        with open(os.path.join(geno_dir, args[4]), "r") as json_fh:
+            geno_json = json.load(json_fh)
+        par_f1s = {
+            "mat": geno_json['mat'],
+            "pat": geno_json['pat'],
+            "f1s": geno_json['f1s']
+        }
+
+        # List of file titles and locations from JSON
+        source_files = [
+            {'title': genofile['title'],
+             'location': os.path.join(geno_dir, genofile['location'])}
+            for genofile in geno_json['genofile']
+        ]
+    else:
+        par_f1s = {}
+        # List of files directly taken from command line arguments, with titles just set to the filename
+        for group in args[4:]:
+            group_file = group if ".geno" in group else group + ".geno"
+            file_name = os.path.join(geno_dir, group_file)
+            source_files.append({'title': file_name[:-5], 'location': file_name})
+
+    if len(source_files) > 1:
+        # Generate a JSON file pointing to the new target genotype files, in situations where there are multiple source .geno files
+        target_json_loc = os.path.join(
+            out_dir, ".".join(args[3].split(".")[:-1]) + ".json")
+        target_json = {'genofile': []}
+
+        # Generate the output .geno files
+        for source_file in source_files:
+            filename, samples = generate_new_genofile(source_file['location'], target_file, par_f1s, out_dir)
+
+            target_json['genofile'].append({
+                'location': os.path.basename(filename),
+                'title': source_file['title'],
+                'sample_list': samples
+            })
+
+        with open(target_json_loc, "w") as json_fh:
+            json.dump(target_json, json_fh)
+    else:
+        generate_new_genofile(source_files[0]['location'], target_file, par_f1s, out_dir)
+
+def get_strain_for_sample(sample):
+    """Look up the strain recorded for a sample name.
+
+    Reads the ``CaseAttributeXRefNew.Value`` stored under case attribute 11
+    for the ``Strain`` row whose name equals ``sample`` (whitespace-stripped).
+
+    :param sample: sample name as it appears in the .geno header
+    :return: the stored value, or None when no matching row exists
+    """
+    query = (
+        "SELECT CaseAttributeXRefNew.Value "
+        "FROM CaseAttributeXRefNew, Strain "
+        "WHERE CaseAttributeXRefNew.CaseAttributeId=11 "
+        "AND CaseAttributeXRefNew.StrainId = Strain.Id "
+        "AND Strain.Name = %(name)s" )
+
+    # A connection is opened per call, so it must be closed here as well;
+    # otherwise one connection leaks for every sample looked up.
+    connection = conn()
+    try:
+        with connection.cursor() as cursor:
+            cursor.execute(query, {"name": sample.strip()})
+            row = cursor.fetchone()
+    finally:
+        connection.close()
+
+    # fetchone() returns None when nothing matched
+    return row[0] if row else None
+
+def generate_new_genofile(source_genofile, target_genofile, par_f1s, out_dir):
+    """Write a .geno file for the target group using the source file's genotypes.
+
+    :param source_genofile: path of the strain-level .geno file to copy from
+    :param target_genofile: path of the target group's .geno file, read only
+        for its sample list and its base name
+    :param par_f1s: dict with optional ``mat``/``pat``/``f1s`` strain lists
+    :param out_dir: directory the new file is written to
+    :return: ``(path of the written file, list of target sample names)``
+    """
+    source_samples = group_samples(source_genofile)
+    source_genotypes = strain_genotypes(source_genofile)
+    target_samples = group_samples(target_genofile)
+    strain_pos_map = map_strain_pos_to_target_group(source_samples, target_samples, par_f1s)
+
+    source_parts = os.path.basename(source_genofile).split(".")
+    target_stem = ".".join(os.path.basename(target_genofile).split(".")[:-1])
+    if len(source_parts) > 2:
+        # The number in the source genofile; for example 4 in BXD.4.geno
+        target_filename = target_stem + "." + source_parts[-2] + ".geno"
+    else:
+        target_filename = target_stem + ".geno"
+
+    # Joined rather than concatenated so out_dir works without a trailing "/"
+    file_location = os.path.join(out_dir, target_filename)
+
+    with open(file_location, "w") as fh:
+        for metadata in ["name", "type", "mat", "pat", "het", "unk"]:
+            fh.write("@" + metadata + ":" + source_genotypes[metadata] + "\n")
+
+        header_line = ["Chr", "Locus", "cM", "Mb"] + target_samples
+        fh.write("\t".join(header_line) + "\n")
+
+        for marker in source_genotypes['markers']:
+            line_items = [
+                marker['Chr'],
+                marker['Locus'],
+                marker['cM'],
+                marker['Mb']
+            ]
+
+            for pos in strain_pos_map:
+                if isinstance(pos, int):
+                    # Index into the source file's genotype columns
+                    line_items.append(marker['genotypes'][pos])
+                elif pos in ["mat", "pat"]:
+                    # Parental strain: use the symbol from the @mat/@pat header
+                    line_items.append(source_genotypes[pos])
+                elif pos == "f1s":
+                    line_items.append("H")
+                else:
+                    line_items.append("U")
+
+            fh.write("\t".join(line_items) + "\n")
+
+    return file_location, target_samples
+
+def map_strain_pos_to_target_group(source_samples, target_samples, par_f1s):
+    """
+    Work out, for every target sample, where its strain sits in the source file
+
+    Each entry of the result is either the index of the sample's strain in
+    source_samples, the par_f1s key ("mat", "pat", "f1s") whose list contains
+    the strain, or "U" when the strain is found in neither.
+
+    For example:
+    Base strains: BXD1, BXD2, BXD3
+    Target samples: BXD1_1, BXD1_2, BXD2_1, BXD3_1, BXD3_2, BXD3_3
+    Returns: [0, 0, 1, 2, 2, 2]
+    """
+    positions = []
+    for target_sample in target_samples:
+        strain = get_strain_for_sample(target_sample)
+        if strain in source_samples:
+            positions.append(source_samples.index(strain))
+            continue
+
+        # When several par_f1s keys list the strain, the last one wins
+        matching_keys = [key for key in par_f1s.keys() if strain in par_f1s[key]]
+        positions.append(matching_keys[-1] if matching_keys else "U")
+
+    return positions
+
+def group_samples(target_file: str) -> List:
+    """
+    Get the group samples from its "dummy" .geno file (which still contains the sample list)
+
+    The first line that is not blank and does not start with "#" or "@" is
+    taken as the header; its whitespace-separated fields other than
+    Chr/Locus/Mb/cM are the sample names.
+
+    :param target_file: path of the .geno file to read
+    :return: list of sample names, empty when the file has no header line
+    """
+    with open(target_file, "r") as target_geno:
+        for line in target_geno:
+            stripped = line.strip()
+            # Skip blank lines too: a line read from a file always contains
+            # at least "\n", so a length check alone never catches them.
+            if not stripped or stripped[0] in ("#", "@"):
+                continue
+
+            return [item for item in stripped.split()
+                    if item not in ("Chr", "Locus", "Mb", "cM")]
+
+    return []
+
+def strain_genotypes(strain_genofile: str) -> dict:
+    """
+    Read metadata and genotypes from source strain .geno file
+
+    :param strain_genofile: string of genofile filename
+    :return: a dictionary holding the ``@`` metadata values found in the file
+        (name, type, mat, pat, het, unk) plus a ``markers`` list with one
+        dictionary per marker row
+
+    Example output: {
+        'name': 'BXD',
+        'mat': 'B',
+        ...,
+        'markers': [
+            {
+                'Chr': '1',
+                'Locus': 'marker1',
+                'Mb': '10.0',
+                'cM': '8.0',
+                'genotypes': ['B', 'D', 'H', ...]
+            },
+            ...
+        ]
+    }
+    """
+
+    geno_dict = {}
+
+    geno_start_col = None
+    header_columns = []
+    markers = []
+    with open(strain_genofile, "r") as source_geno:
+        for raw_line in source_geno:
+            # Remove the terminator up front so neither the last header
+            # column nor the last genotype carries a trailing newline.
+            line = raw_line.rstrip("\r\n")
+
+            # Blank lines would otherwise be parsed as marker rows
+            if not line.strip():
+                continue
+
+            if line[0] == "@":
+                # partition keeps any further ":" inside the value
+                metadata_type, _sep, metadata_value = line[1:].partition(":")
+                if metadata_type in ['name', 'type', 'mat', 'pat', 'het', 'unk']:
+                    geno_dict[metadata_type] = metadata_value.strip()
+
+                continue
+
+            # Skip other header lines
+            if line[0] == "#":
+                continue
+
+            line_items = line.split("\t")
+            if "Chr" in line_items: # Header row
+                # Get the first column index containing genotypes
+                header_columns = line_items
+                for j, item in enumerate(line_items):
+                    if item not in ["Chr", "Locus", "Mb", "cM"]:
+                        geno_start_col = j
+                        break
+
+                if not geno_start_col:
+                    # A string argument is printed to stderr and gives a
+                    # non-zero exit status
+                    sys.exit("Check .geno file - expected columns not found")
+            else: # Marker rows
+                this_marker = {
+                    'Chr': line_items[header_columns.index("Chr")],
+                    'Locus': line_items[header_columns.index("Locus")],
+                    'Mb': line_items[header_columns.index("Mb")],
+                    'cM': line_items[header_columns.index("cM")],
+                    'genotypes': [item.strip() for item in line_items[geno_start_col:]]
+                }
+
+                markers.append(this_marker)
+
+    geno_dict['markers'] = markers
+
+    return geno_dict
+            
+# Script entry point: pass the raw argument vector (including argv[0]) to main()
+if __name__ == "__main__":
+    main(sys.argv)
+
diff --git a/gn2/maintenance/gen_select_dataset.py b/gn2/maintenance/gen_select_dataset.py
new file mode 100644
index 00000000..5f41da29
--- /dev/null
+++ b/gn2/maintenance/gen_select_dataset.py
@@ -0,0 +1,296 @@
+"""Script that generates the data for the main dropdown menus on the home page
+
+Writes out data as /static/new/javascript/dataset_menu_structure.json
+It needs to be run manually when database has been changed. Run it as
+
+  ./bin/genenetwork2 ~/my_settings.py -c ./wqflask/maintenance/gen_select_dataset.py
+
+"""
+
+
+# Copyright (C) University of Tennessee Health Science Center, Memphis, TN.
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU Affero General Public License
+# as published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU Affero General Public License for more details.
+#
+# This program is available from Source Forge: at GeneNetwork Project
+# (sourceforge.net/projects/genenetwork/).
+#
+# Contact Drs. Robert W. Williams
+# at rwilliams@uthsc.edu
+#
+#
+#
+# This module is used by GeneNetwork project (www.genenetwork.org)
+
+import sys
+
+# NEW: Note we prepend the current path - otherwise a guix instance of GN2 may be used instead
+sys.path.insert(0, './')
+# NEW: import app to avoid a circular dependency on utility.tools
+from gn2.wqflask import app
+
+from gn2.utility.tools import get_setting
+
+import simplejson as json
+import urllib.parse
+
+
+from pprint import pformat as pf
+
+from gn2.wqflask.database import database_connection
+
+
+def get_species(cursor):
+    """Return all (Name, MenuName) rows of the Species table, ordered by OrderId, as a list"""
+    cursor.execute("select Name, MenuName from Species order by OrderId")
+    return [*cursor.fetchall()]
+
+
+def get_groups(cursor, species):
+    """Build groups list
+
+    :param cursor: open database cursor
+    :param species: iterable of ``(species name, species menu name)`` pairs
+    :return: dict mapping each species name to a list of
+        ``(InbredSet.Name, InbredSet.FullName)`` rows
+    """
+    groups = {}
+    for species_name, _species_full_name in species:
+        # The name is passed as a query parameter so the driver quotes it;
+        # interpolating it into the SQL text breaks on names containing "'".
+        cursor.execute("""select InbredSet.Name, InbredSet.FullName from InbredSet,
+                       Species,
+                       ProbeFreeze, GenoFreeze, PublishFreeze where Species.Name = %s
+                       and InbredSet.SpeciesId = Species.Id and
+                       (PublishFreeze.InbredSetId = InbredSet.Id
+                        or GenoFreeze.InbredSetId = InbredSet.Id
+                        or ProbeFreeze.InbredSetId = InbredSet.Id)
+                        group by InbredSet.Name
+                        order by InbredSet.FullName""", (species_name,))
+        groups[species_name] = list(cursor.fetchall())
+    return groups
+
+
+def get_types(groups):
+    """Build types list
+
+    For every species/group pair in ``groups``, collect the dataset types
+    available for it as ``(name, name)`` pairs: "Phenotypes" and "Genotypes"
+    when the matching existence check succeeds, followed by whatever
+    build_types returns for the group.
+
+    Groups that end up with no types at all are dropped from the result and
+    also removed from ``groups[species]``, which is rebound to a tuple.
+
+    Returns a dict of the form ``types[species][group_name] = [type, ...]``.
+    """
+    types = {}
+    #print("Groups: ", pf(groups))
+    for species, group_dict in list(groups.items()):
+        types[species] = {}
+        for group_name, _group_full_name in group_dict:
+            # make group an alias to shorten the code
+            #types[species][group_name] = [("Phenotypes", "Phenotypes"), ("Genotypes", "Genotypes")]
+            if phenotypes_exist(group_name):
+                types[species][group_name] = [("Phenotypes", "Phenotypes")]
+            if genotypes_exist(group_name):
+                if group_name in types[species]:
+                    types[species][group_name] += [("Genotypes", "Genotypes")]
+                else:
+                    types[species][group_name] = [("Genotypes", "Genotypes")]
+            if group_name in types[species]:
+                # The group already has a phenotype and/or genotype entry;
+                # append the types reported by build_types
+                types_list = build_types(species, group_name)
+                if len(types_list) > 0:
+                    types[species][group_name] += types_list
+                else:
+                    # NOTE(review): this condition looks unreachable here, since
+                    # the group is only in types[species] when one of the two
+                    # checks just succeeded - confirm before relying on it
+                    if not phenotypes_exist(group_name) and not genotypes_exist(group_name):
+                        types[species].pop(group_name, None)
+                        groups[species] = tuple(
+                            group for group in groups[species] if group[0] != group_name)
+            else:  # ZS: This whole else statement might be unnecessary, need to check
+                # Neither phenotypes nor genotypes: keep the group only if
+                # build_types reports something for it
+                types_list = build_types(species, group_name)
+                if len(types_list) > 0:
+                    types[species][group_name] = types_list
+                else:
+                    types[species].pop(group_name, None)
+                    groups[species] = tuple(
+                        group for group in groups[species] if group[0] != group_name)
+    return types
+
+
+def phenotypes_exist(group_name, cursor=None):
+    """Return True when PublishFreeze has a row named ``<group_name>Publish``.
+
+    :param group_name: group (InbredSet) name
+    :param cursor: database cursor to use; when omitted the module-level
+        ``Cursor`` is used, as existing callers expect
+    """
+    if cursor is None:
+        cursor = Cursor
+    # The name is passed as a query parameter instead of being interpolated
+    cursor.execute("""select Name from PublishFreeze
+                      where PublishFreeze.Name = %s""", (group_name + "Publish",))
+
+    return cursor.fetchone() is not None
+
+
+def genotypes_exist(group_name, cursor=None):
+    """Return True when GenoFreeze has a row named ``<group_name>Geno``.
+
+    :param group_name: group (InbredSet) name
+    :param cursor: database cursor to use; when omitted the module-level
+        ``Cursor`` is used, as existing callers expect
+    """
+    if cursor is None:
+        cursor = Cursor
+    # The name is passed as a query parameter instead of being interpolated
+    cursor.execute("""select Name from GenoFreeze
+                      where GenoFreeze.Name = %s""", (group_name + "Geno",))
+
+    return cursor.fetchone() is not None
+
+
+def build_types(species, group):
+    """Fetches tissues
+
+    Gets the tissues with data for this species/group
+    (all types except phenotype/genotype are tissues)
+
+    Only tissues for which build_datasets returns at least one dataset are
+    kept. Returns a list of ``(tissue name, tissue name)`` pairs. Uses the
+    module-level ``Cursor``.
+    """
+
+    # Values are passed as query parameters instead of being interpolated
+    Cursor.execute("""select distinct Tissue.Name
+                       from ProbeFreeze, ProbeSetFreeze, InbredSet, Tissue, Species
+                       where Species.Name = %s and Species.Id = InbredSet.SpeciesId and
+                       InbredSet.Name = %s and
+                       ProbeFreeze.TissueId = Tissue.Id and
+                       ProbeFreeze.InbredSetId = InbredSet.Id and
+                       ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and
+                       ProbeSetFreeze.public > 0 and
+                       ProbeSetFreeze.confidentiality < 1
+                       order by Tissue.Name""", (species, group))
+
+    results = []
+    # fetchall() materialises the rows first, so the nested build_datasets
+    # calls can reuse the same cursor safely
+    for result in Cursor.fetchall():
+        if len(result):
+            these_datasets = build_datasets(species, group, result[0])
+            if len(these_datasets) > 0:
+                results.append((result[0], result[0]))
+
+    return results
+
+
+def get_datasets(types):
+    """Build the nested species -> group -> type -> datasets mapping.
+
+    Types for which build_datasets returns nothing are left out; species and
+    groups are always present, even when empty.
+    """
+    datasets = {}
+    for species, group_dict in list(types.items()):
+        per_species = datasets[species] = {}
+        for group, type_list in list(group_dict.items()):
+            per_group = per_species[group] = {}
+            for type_entry in type_list:
+                type_key = type_entry[0]
+                found = build_datasets(species, group, type_key)
+                if len(found) > 0:
+                    per_group[type_key] = found
+
+    return datasets
+
+
+def build_datasets(species, group, type_name, cursor=None):
+    """Gets dataset names from database
+
+    :param species: species name (only used for tissue/ProbeSet types)
+    :param group: group (InbredSet) name
+    :param type_name: "Phenotypes", "Genotypes", or a tissue name
+    :param cursor: database cursor to use; when omitted the module-level
+        ``Cursor`` is used, as existing callers expect
+    :return: list of ``(id, name, full name)`` entries - tuples for
+        phenotypes/genotypes, lists of strings for tissue datasets
+    """
+    if cursor is None:
+        cursor = Cursor
+    dataset_text = dataset_value = None
+    datasets = []
+    # All values are passed as query parameters instead of being interpolated
+    if type_name == "Phenotypes":
+        cursor.execute("""select InfoFiles.GN_AccesionId, PublishFreeze.Name, PublishFreeze.FullName from InfoFiles, PublishFreeze, InbredSet where
+                    InbredSet.Name = %s and
+                    PublishFreeze.InbredSetId = InbredSet.Id and
+                    InfoFiles.InfoPageName = PublishFreeze.Name order by
+                    PublishFreeze.CreateTime asc""", (group,))
+
+        results = cursor.fetchall()
+        if len(results) > 0:
+            for result in results:
+                dataset_id = str(result[0])
+                dataset_value = str(result[1])
+                if group == 'MDP':
+                    dataset_text = "Mouse Phenome Database"
+                else:
+                    dataset_text = str(result[2])
+                datasets.append((dataset_id, dataset_value, dataset_text))
+        else:
+            # No InfoFiles entry: fall back to the conventional names
+            dataset_id = "None"
+            dataset_value = "%sPublish" % group
+            dataset_text = "%s Phenotypes" % group
+            datasets.append((dataset_id, dataset_value, dataset_text))
+
+    elif type_name == "Genotypes":
+        cursor.execute("""select InfoFiles.GN_AccesionId from InfoFiles, GenoFreeze, InbredSet where
+                    InbredSet.Name = %s and
+                    GenoFreeze.InbredSetId = InbredSet.Id and
+                    InfoFiles.InfoPageName = GenoFreeze.ShortName and
+                    GenoFreeze.public > 0 and
+                    GenoFreeze.confidentiality < 1 order by
+                    GenoFreeze.CreateTime desc""", (group,))
+
+        results = cursor.fetchone()
+        if results is not None:
+            dataset_id = str(results[0])
+        else:
+            dataset_id = "None"
+        dataset_value = "%sGeno" % group
+        dataset_text = "%s Genotypes" % group
+        datasets.append((dataset_id, dataset_value, dataset_text))
+
+    else:  # for mRNA expression/ProbeSet
+        cursor.execute("""select ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName from
+                    ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue, Species where
+                    Species.Name = %s and Species.Id = InbredSet.SpeciesId and
+                    InbredSet.Name = %s and
+                    ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and Tissue.Name = %s and
+                    ProbeFreeze.TissueId = Tissue.Id and ProbeFreeze.InbredSetId = InbredSet.Id and
+                    ProbeSetFreeze.confidentiality < 1 and ProbeSetFreeze.public > 0 order by
+                    ProbeSetFreeze.CreateTime desc""", (species, group, type_name))
+
+        # Every column is stringified so the result serialises cleanly to JSON
+        datasets = [[str(info) for info in dataset_info]
+                    for dataset_info in cursor.fetchall()]
+
+    return datasets
+
+
+def main(cursor):
+    """Generates and outputs (as json file) the data for the main dropdown menus on the home page
+
+    :param cursor: open database cursor used for every query
+    """
+    # Several helpers in this module (phenotypes_exist, genotypes_exist,
+    # build_types, build_datasets) read a module-level ``Cursor`` that nothing
+    # else assigns. Bind it here so they use the cursor we were given instead
+    # of failing with NameError.
+    global Cursor
+    Cursor = cursor
+
+    species = get_species(cursor)
+    groups = get_groups(cursor, species)
+    types = get_types(groups)
+    datasets = get_datasets(types)
+
+    data = dict(species=species,
+                groups=groups,
+                types=types,
+                datasets=datasets,
+                )
+
+    output_file = """./wqflask/static/new/javascript/dataset_menu_structure.json"""
+
+    with open(output_file, 'w') as fh:
+        json.dump(data, fh, indent="   ", sort_keys=True)
+
+
+def _test_it():
+    """Smoke-run two of the query helpers; for internal testing only"""
+    # Results are deliberately discarded: this only checks the calls succeed
+    build_types("Mouse", "BXD")
+    build_datasets("Mouse", "BXD", "Hippocampus")
+
+
+# Script entry point: open a connection from the SQL_URI setting and build
+# the menu JSON with a cursor that is closed when the run finishes.
+if __name__ == '__main__':
+    with database_connection(get_setting("SQL_URI")) as conn:
+        with conn.cursor() as cursor:
+            main(cursor)
diff --git a/gn2/maintenance/generate_kinship_from_bimbam.py b/gn2/maintenance/generate_kinship_from_bimbam.py
new file mode 100644
index 00000000..9f01d094
--- /dev/null
+++ b/gn2/maintenance/generate_kinship_from_bimbam.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+
+"""
+Generate relatedness matrix files for GEMMA from BIMBAM genotype/phenotype files
+
+This file goes through all of the BIMBAM files in the bimbam directory
+and uses GEMMA to generate their corresponding kinship/relatedness matrix file
+
+"""
+
+import sys
+sys.path.append("..")
+import os
+import glob
+
+
+class GenerateKinshipMatrices:
+    def __init__(self, group_name, geno_file, pheno_file):
+        self.group_name = group_name
+        self.geno_file = geno_file
+        self.pheno_file = pheno_file
+
+    def generate_kinship(self):
+        gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + \
+            " -p " + self.pheno_file + \
+            " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name
+        print("command:", gemma_command)
+        os.system(gemma_command)
+
+    @classmethod
+    def process_all(self, geno_dir, bimbam_dir):
+        os.chdir(geno_dir)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            if group_name == "HSNIH-Palmer":
+                continue
+            geno_input_file = os.path.join(
+                bimbam_dir, group_name + "_geno.txt")
+            pheno_input_file = os.path.join(
+                bimbam_dir, group_name + "_pheno.txt")
+            convertob = GenerateKinshipMatrices(
+                group_name, geno_input_file, pheno_input_file)
+            try:
+                convertob.generate_kinship()
+            except EmptyConfigurations as why:
+                print("  No config info? Continuing...")
+                continue
+            except Exception as why:
+
+                print("  Exception:", why)
+                print(traceback.print_exc())
+                print("    Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+                                                                    convertob.latest_col_pos))
+                print("    Column is:", convertob.latest_col_value)
+                print("    Row is:", convertob.latest_row_value)
+                break
+
+
+if __name__ == "__main__":
+    Geno_Directory = """/export/local/home/zas1024/genotype_files/genotype/"""
+    Bimbam_Directory = """/export/local/home/zas1024/genotype_files/genotype/bimbam/"""
+    GenerateKinshipMatrices.process_all(Geno_Directory, Bimbam_Directory)
+
+    # ./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD
diff --git a/gn2/maintenance/generate_probesetfreeze_file.py b/gn2/maintenance/generate_probesetfreeze_file.py
new file mode 100644
index 00000000..00c2cddf
--- /dev/null
+++ b/gn2/maintenance/generate_probesetfreeze_file.py
@@ -0,0 +1,122 @@
+#!/usr/bin/python
+
+import sys
+
+# sys.path.insert(0, "..") - why?
+
+import os
+import collections
+import csv
+
+from gn2.base import webqtlConfig
+
+from pprint import pformat as pf
+
+from gn2.utility.tools import get_setting
+from gn2.wqflask.database import database_connection
+
+
+def show_progress(process, counter):
+    if counter % 1000 == 0:
+        print("{}: {}".format(process, counter))
+
+
+def get_strains(cursor):
+    cursor.execute("""select Strain.Name
+                      from Strain, StrainXRef, InbredSet
+                      where Strain.Id = StrainXRef.StrainId and
+                            StrainXRef.InbredSetId = InbredSet.Id
+                            and InbredSet.Name=%s;
+                """, "BXD")
+
+    strains = [strain[0] for strain in cursor.fetchall()]
+    print("strains:", pf(strains))
+    for strain in strains:
+        print(" -", strain)
+
+    return strains
+
+
+def get_probeset_vals(cursor, dataset_name):
+    cursor.execute(""" select ProbeSet.Id, ProbeSet.Name
+                from ProbeSetXRef,
+                     ProbeSetFreeze,
+                     ProbeSet
+                where ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and
+                      ProbeSetFreeze.Name = %s and
+                      ProbeSetXRef.ProbeSetId = ProbeSet.Id;
+            """, dataset_name)
+
+    probesets = cursor.fetchall()
+
+    print("Fetched probesets")
+
+    probeset_vals = collections.OrderedDict()
+
+    for counter, probeset in enumerate(probesets):
+        cursor.execute(""" select Strain.Name, ProbeSetData.value
+                       from ProbeSetData, ProbeSetXRef, ProbeSetFreeze, Strain
+                       where ProbeSetData.Id = ProbeSetXRef.DataId
+                       and ProbeSetData.StrainId = Strain.Id
+                       and ProbeSetXRef.ProbeSetId = %s
+                       and ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId
+                       and ProbeSetFreeze.Name = %s;
+                """, (probeset[0], dataset_name))
+        val_dic = collections.OrderedDict()
+        vals = cursor.fetchall()
+        for val in vals:
+            val_dic[val[0]] = val[1]
+
+        probeset_vals[probeset[1]] = val_dic
+        show_progress("Querying DB", counter)
+
+    return probeset_vals
+
+
+def trim_strains(strains, probeset_vals):
+    trimmed_strains = []
+    #print("probeset_vals is:", pf(probeset_vals))
+    first_probeset = list(probeset_vals.values())[0]
+    print("\n**** first_probeset is:", pf(first_probeset))
+    for strain in strains:
+        print("\n**** strain is:", pf(strain))
+        if strain in first_probeset:
+            trimmed_strains.append(strain)
+    print("trimmed_strains:", pf(trimmed_strains))
+    return trimmed_strains
+
+
+def write_data_matrix_file(strains, probeset_vals, filename):
+    with open(filename, "wb") as fh:
+        csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL)
+        #print("strains is:", pf(strains))
+        csv_writer.writerow(['ID'] + strains)
+        for counter, probeset in enumerate(probeset_vals):
+            row_data = [probeset]
+            for strain in strains:
+                #print("probeset is: ", pf(probeset_vals[probeset]))
+                row_data.append(probeset_vals[probeset][strain])
+            #print("row_data is: ", pf(row_data))
+            csv_writer.writerow(row_data)
+            show_progress("Writing", counter)
+
+
+def main():
+    filename = os.path.expanduser(
+        "~/gene/wqflask/maintenance/"
+        "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2"
+        "(Oct08)_RankInv_Beta.txt")
+    dataset_name = "Eye_AXBXA_1008_RankInv"
+
+    with database_connection(get_setting("SQL_URI")) as conn:
+        with conn.cursor() as cursor:
+            strains = get_strains(cursor)
+            print("Getting probset_vals")
+            probeset_vals = get_probeset_vals(cursor, dataset_name)
+            print("Finished getting probeset_vals")
+            trimmed_strains = trim_strains(strains, probeset_vals)
+            write_data_matrix_file(trimmed_strains, probeset_vals, filename)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/gn2/maintenance/geno_to_json.py b/gn2/maintenance/geno_to_json.py
new file mode 100644
index 00000000..7be2ed83
--- /dev/null
+++ b/gn2/maintenance/geno_to_json.py
@@ -0,0 +1,196 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to json
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to json files that are used when running the marker regression
+code
+
+"""
+
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+import gzip
+
+#import numpy as np
+#from pyLMM import lmm
+
+import simplejson as json
+
+from pprint import pformat as pf
+
+#from gn2.utility.tools import flat_files
+
+
+class EmptyConfigurations(Exception):
+    pass
+
+
+class Marker:
+    def __init__(self):
+        self.name = None
+        self.chr = None
+        self.cM = None
+        self.Mb = None
+        self.genotypes = []
+
+
+class ConvertGenoFile:
+
+    def __init__(self, input_file, output_file):
+
+        self.input_file = input_file
+        self.output_file = output_file
+
+        self.mb_exists = False
+        self.cm_exists = False
+        self.markers = []
+
+        self.latest_row_pos = None
+        self.latest_col_pos = None
+
+        self.latest_row_value = None
+        self.latest_col_value = None
+
+    def convert(self):
+
+        self.haplotype_notation = {
+            '@mat': "1",
+            '@pat': "0",
+            '@het': "0.5",
+            '@unk': "NA"
+        }
+
+        self.configurations = {}
+        #self.skipped_cols = 3
+
+        # if self.input_file.endswith(".geno.gz"):
+        #    print("self.input_file: ", self.input_file)
+        #    self.input_fh = gzip.open(self.input_file)
+        # else:
+        self.input_fh = open(self.input_file)
+
+        with open(self.output_file, "w") as self.output_fh:
+            # if self.file_type == "geno":
+            self.process_csv()
+            # elif self.file_type == "snps":
+            #    self.process_snps_file()
+
+    def process_csv(self):
+        for row_count, row in enumerate(self.process_rows()):
+            row_items = row.split("\t")
+
+            this_marker = Marker()
+            this_marker.name = row_items[1]
+            this_marker.chr = row_items[0]
+            if self.cm_exists and self.mb_exists:
+                this_marker.cM = row_items[2]
+                this_marker.Mb = row_items[3]
+                genotypes = row_items[4:]
+            elif self.cm_exists:
+                this_marker.cM = row_items[2]
+                genotypes = row_items[3:]
+            elif self.mb_exists:
+                this_marker.Mb = row_items[2]
+                genotypes = row_items[3:]
+            else:
+                genotypes = row_items[2:]
+            for item_count, genotype in enumerate(genotypes):
+                if genotype.upper() in self.configurations:
+                    this_marker.genotypes.append(
+                        self.configurations[genotype.upper()])
+                else:
+                    this_marker.genotypes.append("NA")
+
+            #print("this_marker is:", pf(this_marker.__dict__))
+            # if this_marker.chr == "14":
+            self.markers.append(this_marker.__dict__)
+
+        with open(self.output_file, 'w') as fh:
+            json.dump(self.markers, fh, indent="   ", sort_keys=True)
+
+            # print('configurations:', str(configurations))
+            #self.latest_col_pos = item_count + self.skipped_cols
+            #self.latest_col_value = item
+
+            # if item_count != 0:
+            #    self.output_fh.write(" ")
+            # self.output_fh.write(self.configurations[item.upper()])
+
+            # self.output_fh.write("\n")
+
+    def process_rows(self):
+        for self.latest_row_pos, row in enumerate(self.input_fh):
+            # if self.input_file.endswith(".geno.gz"):
+            #    print("row: ", row)
+            self.latest_row_value = row
+            # Take care of headers
+            if not row.strip():
+                continue
+            if row.startswith('#'):
+                continue
+            if row.startswith('Chr'):
+                if 'Mb' in row.split():
+                    self.mb_exists = True
+                if 'cM' in row.split():
+                    self.cm_exists = True
+                continue
+            if row.startswith('@'):
+                key, _separater, value = row.partition(':')
+                key = key.strip()
+                value = value.strip()
+                if key in self.haplotype_notation:
+                    self.configurations[value] = self.haplotype_notation[key]
+                continue
+            if not len(self.configurations):
+                raise EmptyConfigurations
+            yield row
+
+    @classmethod
+    def process_all(cls, old_directory, new_directory):
+        os.chdir(old_directory)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            output_file = os.path.join(new_directory, group_name + ".json")
+            print("%s -> %s" % (
+                os.path.join(old_directory, input_file), output_file))
+            convertob = ConvertGenoFile(input_file, output_file)
+            try:
+                convertob.convert()
+            except EmptyConfigurations as why:
+                print("  No config info? Continuing...")
+                #excepted = True
+                continue
+            except Exception as why:
+
+                print("  Exception:", why)
+                print(traceback.print_exc())
+                print("    Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+                                                                    convertob.latest_col_pos))
+                print("    Column is:", convertob.latest_col_value)
+                print("    Row is:", convertob.latest_row_value)
+                break
+
+    # def process_snps_file(cls, snps_file, new_directory):
+    #    output_file = os.path.join(new_directory, "mouse_families.json")
+    #    print("%s -> %s" % (snps_file, output_file))
+    #    convertob = ConvertGenoFile(input_file, output_file)
+
+
+if __name__ == "__main__":
+    Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
+    New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/json"""
+    #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
+    #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
+    #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
+    # convertob.convert()
+    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
+    # ConvertGenoFiles(Geno_Directory)
+
+    #process_csv(Input_File, Output_File)
diff --git a/gn2/maintenance/get_group_samplelists.py b/gn2/maintenance/get_group_samplelists.py
new file mode 100644
index 00000000..6af637ea
--- /dev/null
+++ b/gn2/maintenance/get_group_samplelists.py
@@ -0,0 +1,47 @@
+import os
+import glob
+import gzip
+
+from gn2.base import webqtlConfig
+
+
+def get_samplelist(file_type, geno_file):
+    if file_type == "geno":
+        return get_samplelist_from_geno(geno_file)
+    elif file_type == "plink":
+        return get_samplelist_from_plink(geno_file)
+
+
+def get_samplelist_from_geno(genofilename):
+    if os.path.isfile(genofilename + '.gz'):
+        genofilename += '.gz'
+        genofile = gzip.open(genofilename)
+    else:
+        genofile = open(genofilename)
+
+    for line in genofile:
+        line = line.strip()
+        if not line:
+            continue
+        if line.startswith(("#", "@")):
+            continue
+        break
+
+    headers = line.split("\t")
+
+    if headers[3] == "Mb":
+        samplelist = headers[4:]
+    else:
+        samplelist = headers[3:]
+    return samplelist
+
+
+def get_samplelist_from_plink(genofilename):
+    genofile = open(genofilename)
+
+    samplelist = []
+    for line in genofile:
+        line = line.split(" ")
+        samplelist.append(line[1])
+
+    return samplelist
diff --git a/gn2/maintenance/print_benchmark.py b/gn2/maintenance/print_benchmark.py
new file mode 100644
index 00000000..9d12da8a
--- /dev/null
+++ b/gn2/maintenance/print_benchmark.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+
+import time
+
+from pprint import pformat as pf
+
+
+class TheCounter:
+    Counters = {}
+
+    def __init__(self):
+        start_time = time.time()
+        for counter in range(170000):
+            self.print_it(counter)
+        self.time_took = time.time() - start_time
+        TheCounter.Counters[self.__class__.__name__] = self.time_took
+
+
+class PrintAll(TheCounter):
+    def print_it(self, counter):
+        print(counter)
+
+
+class PrintSome(TheCounter):
+    def print_it(self, counter):
+        if counter % 1000 == 0:
+            print(counter)
+
+
+class PrintNone(TheCounter):
+    def print_it(self, counter):
+        pass
+
+
+def new_main():
+    print("Running new_main")
+    tests = [PrintAll, PrintSome, PrintNone]
+    for test in tests:
+        test()
+
+    print(pf(TheCounter.Counters))
+
+
+if __name__ == '__main__':
+    new_main()
diff --git a/gn2/maintenance/quantile_normalize.py b/gn2/maintenance/quantile_normalize.py
new file mode 100644
index 00000000..5620b552
--- /dev/null
+++ b/gn2/maintenance/quantile_normalize.py
@@ -0,0 +1,98 @@
+import sys
+sys.path.insert(0, './')
+import urllib.parse
+
+import numpy as np
+import pandas as pd
+
+from flask import Flask, g, request
+
+from gn2.wqflask import app
+from gn2.wqflask.database import database_connection
+from gn2.utility.tools import get_setting
+
+
+def create_dataframe(input_file):
+    with open(input_file) as f:
+        ncols = len(f.readline().split("\t"))
+
+    input_array = np.loadtxt(open(
+        input_file, "rb"), delimiter="\t", skiprows=1, usecols=list(range(1, ncols)))
+    return pd.DataFrame(input_array)
+
+# This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+
+
+def quantileNormalize(df_input):
+    df = df_input.copy()
+    # compute rank
+    dic = {}
+    for col in df:
+        dic.update({col: sorted(df[col])})
+    sorted_df = pd.DataFrame(dic)
+    rank = sorted_df.mean(axis=1).tolist()
+    # sort
+    for col in df:
+        t = np.searchsorted(np.sort(df[col]), df[col])
+        df[col] = [rank[i] for i in t]
+    return df
+
+
+def set_data(cursor, dataset_name):
+    orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
+
+    sample_list = []
+    with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh:
+        for i, (line1, line2) in enumerate(zip(orig_fh, quant_fh)):
+            trait_dict = {}
+            sample_list = []
+            if i == 0:
+                sample_names = line1.split('\t')[1:]
+            else:
+                trait_name = line1.split('\t')[0]
+                for i, sample in enumerate(sample_names):
+                    this_sample = {
+                        "name": sample,
+                        "value": line1.split('\t')[i + 1],
+                        "qnorm": line2.split('\t')[i + 1]
+                    }
+                    sample_list.append(this_sample)
+                query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName
+                           FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet
+                           WHERE Species.Id = InbredSet.SpeciesId and
+                                 InbredSet.Id = ProbeFreeze.InbredSetId and
+                                 ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId and
+                                 ProbeSetFreeze.Name = '%s' and
+                                 ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
+                                 ProbeSetXRef.ProbeSetId = ProbeSet.Id and
+                                 ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0])
+                cursor.execute(query)
+                result_info = cursor.fetchone()
+
+                yield {
+                    "_index": "traits",
+                    "_type": "trait",
+                    "_source": {
+                        "name": trait_name,
+                        "species": result_info[0],
+                        "group": result_info[1],
+                        "dataset": dataset_name,
+                        "dataset_fullname": result_info[2],
+                        "samples": sample_list,
+                        "transform_types": "qnorm"
+                    }
+                }
+
+
+if __name__ == '__main__':
+    # Script entry point; the dataset name is taken from the first
+    # command-line argument.
+    # NOTE(review): `bulk` and `es` are neither imported nor defined in this
+    # module, so this block presumably fails with a NameError as written.
+    # They look like an Elasticsearch bulk helper and client -- TODO confirm
+    # and either restore that setup or retire this entry point.
+    with database_connection(get_setting("SQL_URI")) as conn:
+        with conn.cursor() as cursor:
+            success, _ = bulk(es, set_data(cursor, sys.argv[1]))
+
+            # Sanity check: look one known trait up again after indexing.
+            response = es.search(
+                index="traits", doc_type="trait", body={
+                    "query": {"match": {"name": "ENSMUSG00000028982"}}
+                }
+            )
+
+            print(response)
diff --git a/gn2/maintenance/set_resource_defaults.py b/gn2/maintenance/set_resource_defaults.py
new file mode 100644
index 00000000..f9e5494a
--- /dev/null
+++ b/gn2/maintenance/set_resource_defaults.py
@@ -0,0 +1,153 @@
+"""
+
+Script that sets default resource access masks for use with the DB proxy
+
+Defaults will be:
+Owner - omni_gn
+Mask  - Public/non-confidential: { data: "view",
+                                   metadata: "view",
+                                   admin: "not-admin" }
+        Private/confidential:    { data: "no-access",
+                                   metadata: "no-access",
+                                   admin: "not-admin" }
+
+To run:
+./bin/genenetwork2 ~/my_settings.py -c ./wqflask/maintenance/set_resource_defaults.py
+
+"""
+
+import sys
+import json
+
+# NEW: Note we prepend the current path - otherwise a guix instance of GN2 may be used instead
+sys.path.insert(0, './')
+
+# NEW: import app to avoid a circular dependency on utility.tools
+from gn2.wqflask import app
+
+from gn2.utility import hmac
+from gn2.utility.tools import get_setting
+from gn2.utility.redis_tools import get_redis_conn, get_user_id, add_resource, get_resources, get_resource_info
+Redis = get_redis_conn()
+
+import urllib.parse
+
+from gn2.wqflask.database import database_connection
+
+
+def insert_probeset_resources(cursor, default_owner_id):
+    current_resources = Redis.hgetall("resources")
+    cursor.execute("""  SELECT
+                            ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.confidentiality, ProbeSetFreeze.public
+                        FROM
+                            ProbeSetFreeze""")
+
+    resource_results = cursor.fetchall()
+    for i, resource in enumerate(resource_results):
+        resource_ob = {}
+        resource_ob['name'] = resource[1]
+        resource_ob['owner_id'] = default_owner_id
+        resource_ob['data'] = {"dataset": str(resource[0])}
+        resource_ob['type'] = "dataset-probeset"
+        if resource[2] < 1 and resource[3] > 0:
+            resource_ob['default_mask'] = {"data": "view",
+                                           "metadata": "view",
+                                           "admin": "not-admin"}
+        else:
+            resource_ob['default_mask'] = {"data": "no-access",
+                                           "metadata": "no-access",
+                                           "admin": "not-admin"}
+        resource_ob['group_masks'] = {}
+
+        add_resource(resource_ob, update=False)
+
+
+def insert_publish_resources(cursor, default_owner_id):
+    current_resources = Redis.hgetall("resources")
+    cursor.execute("""  SELECT 
+                            PublishXRef.Id, PublishFreeze.Id, InbredSet.InbredSetCode
+                        FROM
+                            PublishXRef, PublishFreeze, InbredSet, Publication
+                        WHERE
+                            PublishFreeze.InbredSetId = PublishXRef.InbredSetId AND
+                            InbredSet.Id = PublishXRef.InbredSetId AND
+                            Publication.Id = PublishXRef.PublicationId""")
+
+    resource_results = cursor.fetchall()
+    for resource in resource_results:
+        if resource[2]:
+            resource_ob = {}
+            if resource[2]:
+                resource_ob['name'] = resource[2] + "_" + str(resource[0])
+            else:
+                resource_ob['name'] = str(resource[0])
+            resource_ob['owner_id'] = default_owner_id
+            resource_ob['data'] = {"dataset": str(resource[1]),
+                                   "trait": str(resource[0])}
+            resource_ob['type'] = "dataset-publish"
+            resource_ob['default_mask'] = {"data": "view",
+                                           "metadata": "view",
+                                           "admin": "not-admin"}
+
+            resource_ob['group_masks'] = {}
+
+            add_resource(resource_ob, update=False)
+        else:
+            continue
+
+
+def insert_geno_resources(cursor, default_owner_id):
+    current_resources = Redis.hgetall("resources")
+    cursor.execute("""  SELECT
+                            GenoFreeze.Id, GenoFreeze.ShortName, GenoFreeze.confidentiality
+                        FROM
+                            GenoFreeze""")
+
+    resource_results = cursor.fetchall()
+    for i, resource in enumerate(resource_results):
+        resource_ob = {}
+        resource_ob['name'] = resource[1]
+        if resource[1] == "HET3-ITPGeno":
+            resource_ob['owner_id'] = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae"
+        else:
+            resource_ob['owner_id'] = default_owner_id
+        resource_ob['data'] = {"dataset": str(resource[0])}
+        resource_ob['type'] = "dataset-geno"
+        if resource[2] < 1:
+            resource_ob['default_mask'] = {"data": "view",
+                                           "metadata": "view",
+                                           "admin": "not-admin"}
+        else:
+            resource_ob['default_mask'] = {"data": "no-access",
+                                           "metadata": "no-access",
+                                           "admin": "not-admin"}
+        resource_ob['group_masks'] = {}
+
+        add_resource(resource_ob, update=False)
+
+
+def insert_resources(default_owner_id):
+    current_resources = get_resources()
+    print("START")
+    insert_publish_resources(cursor, default_owner_id)
+    print("AFTER PUBLISH")
+    insert_geno_resources(cursor, default_owner_id)
+    print("AFTER GENO")
+    insert_probeset_resources(cursor, default_owner_id)
+    print("AFTER PROBESET")
+
+
+def main(cursor):
+    """Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
+
+    Redis.delete("resources")
+
+    owner_id = "c5ce8c56-78a6-474f-bcaf-7129d97f56ae"
+
+    insert_resources(owner_id)
+
+
+if __name__ == '__main__':
+    with database_connection(get_setting("SQL_URI")) as conn:
+        with conn.cursor() as cursor:
+            main(cursor)
author	Arun Isaac	2023-12-29 18:55:37 +0000
committer	Arun Isaac	2023-12-29 19:01:46 +0000
commit	204a308be0f741726b9a620d88fbc22b22124c81 (patch)
tree	b3cf66906674020b530c844c2bb4982c8a0e2d39 /gn2/maintenance
parent	83062c75442160427b50420161bfcae2c5c34c84 (diff)
download	genenetwork2-204a308be0f741726b9a620d88fbc22b22124c81.tar.gz