| author | acenteno | 2020-04-21 17:35:34 -0500 |
|---|---|---|
| committer | GitHub | 2020-04-21 17:35:34 -0500 |
| commit | 660589b9c2a507529e8e51ca6ce66ca97ad982c5 (patch) | |
| tree | 27f63957278581bc2fce2b88744bfe20c8a81558 /wqflask/maintenance | |
| parent | d97fdc18359233f07c1a1c7b83fe7e88eb225043 (diff) | |
| parent | f2a3ae13231a7d270a5bb6911c248aa713f1ef91 (diff) | |
| download | genenetwork2-660589b9c2a507529e8e51ca6ce66ca97ad982c5.tar.gz | |
Merge pull request #1 from genenetwork/testing
Updating my testing branch
Diffstat (limited to 'wqflask/maintenance')

| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | wqflask/maintenance/convert_dryad_to_bimbam.py | 70 |
| -rw-r--r-- | wqflask/maintenance/convert_geno_to_bimbam.py | 190 |
| -rw-r--r-- | wqflask/maintenance/gen_select_dataset.py | 99 |
| -rw-r--r-- | wqflask/maintenance/generate_kinship_from_bimbam.py | 61 |
| -rw-r--r-- | wqflask/maintenance/geno_to_json.py | 197 |
| -rw-r--r-- | wqflask/maintenance/get_group_samplelists.py | 12 |
| -rw-r--r-- | wqflask/maintenance/quantile_normalize.py | 129 |

7 files changed, 703 insertions, 55 deletions
```diff
diff --git a/wqflask/maintenance/convert_dryad_to_bimbam.py b/wqflask/maintenance/convert_dryad_to_bimbam.py
new file mode 100644
index 00000000..e833b395
--- /dev/null
+++ b/wqflask/maintenance/convert_dryad_to_bimbam.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python
+
+"""
+Convert Data Dryad files to a BIMBAM _geno and _snps file
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+
+
+def read_dryad_file(filename):
+    exclude_count = 0
+    marker_list = []
+    sample_dict = {}
+    sample_list = []
+    geno_rows = []
+    with open(filename, 'r') as the_file:
+        for i, line in enumerate(the_file):
+            if i > 0:
+                if line.split(" ")[1] == "no":
+                    sample_name = line.split(" ")[0]
+                    sample_list.append(sample_name)
+                    sample_dict[sample_name] = line.split(" ")[2:]
+                else:
+                    exclude_count += 1
+            else:
+                marker_list = line.split(" ")[2:]
+
+    for i, marker in enumerate(marker_list):
+        this_row = []
+        this_row.append(marker)
+        this_row.append("X")
+        this_row.append("Y")
+        for sample in sample_list:
+            this_row.append(sample_dict[sample][i])
+        geno_rows.append(this_row)
+
+    print(exclude_count)
+
+    return geno_rows
+
+    #for i, marker in enumerate(marker_list):
+    #    this_row = []
+    #    this_row.append(marker)
+    #    this_row.append("X")
+    #    this_row.append("Y")
+    #    with open(filename, 'r') as the_file:
+    #        for j, line in enumerate(the_file):
+    #            if j > 0:
+    #                this_row.append(line.split(" ")[i+2])
+    #        print("row: " + str(i))
+    #    geno_rows.append(this_row)
+    #
+    #return geno_rows
+
+
+def write_bimbam_files(geno_rows):
+    with open('/home/zas1024/cfw_data/CFW_geno.txt', 'w') as geno_fh:
+        for row in geno_rows:
+            geno_fh.write(", ".join(row) + "\n")
+
+
+def convert_dryad_to_bimbam(filename):
+    geno_file_rows = read_dryad_file(filename)
+    write_bimbam_files(geno_file_rows)
+
+
+if __name__ == "__main__":
+    input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+    convert_dryad_to_bimbam(input_filename)
\ No newline at end of file
```
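For context, a sketch of the data shapes involved, with invented sample and marker names: `read_dryad_file` keeps only samples whose second column is "no", transposes the table into one row per marker, and `write_bimbam_files` joins each row with ", " to get the BIMBAM mean-genotype layout (SNP id, two allele placeholders, one value per kept sample).

```python
# Invented miniature of the space-delimited Dryad-style input:
#   id    discard  rs1001  rs1002
#   CFW1  no       0       1
#   CFW2  no       0.5     0
#   CFW3  yes      1       1    <- second field != "no": sample excluded
#
# read_dryad_file() transposes the kept samples into one row per marker,
# padding the BIMBAM allele columns with the placeholders "X" and "Y":
rows = [
    ["rs1001", "X", "Y", "0", "0.5"],
    ["rs1002", "X", "Y", "1", "0"],
]

# write_bimbam_files() then emits one comma-separated line per marker:
for row in rows:
    print(", ".join(row))
# rs1001, X, Y, 0, 0.5
# rs1002, X, Y, 1, 0
```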
```diff
diff --git a/wqflask/maintenance/convert_geno_to_bimbam.py b/wqflask/maintenance/convert_geno_to_bimbam.py
new file mode 100644
index 00000000..528b98cf
--- /dev/null
+++ b/wqflask/maintenance/convert_geno_to_bimbam.py
@@ -0,0 +1,190 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to BIMBAM files
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to the BIMBAM files (genotype, phenotype, and SNP position)
+used when running the marker regression code
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+import gzip
+
+import simplejson as json
+
+from pprint import pformat as pf
+
+class EmptyConfigurations(Exception): pass
+
+class Marker(object):
+    def __init__(self):
+        self.name = None
+        self.chr = None
+        self.cM = None
+        self.Mb = None
+        self.genotypes = []
+
+class ConvertGenoFile(object):
+
+    def __init__(self, input_file, output_files):
+        self.input_file = input_file
+        self.output_files = output_files
+
+        self.mb_exists = False
+        self.cm_exists = False
+        self.markers = []
+
+        self.latest_row_pos = None
+        self.latest_col_pos = None
+
+        self.latest_row_value = None
+        self.latest_col_value = None
+
+    def convert(self):
+        self.haplotype_notation = {
+            '@mat': "1",
+            '@pat': "0",
+            '@het': "0.5",
+            '@unk': "NA"
+        }
+
+        self.configurations = {}
+        self.input_fh = open(self.input_file)
+
+        self.process_csv()
+
+    def process_csv(self):
+        for row in self.process_rows():
+            row_items = row.split("\t")
+
+            this_marker = Marker()
+            this_marker.name = row_items[1]
+            this_marker.chr = row_items[0]
+            if self.cm_exists and self.mb_exists:
+                this_marker.cM = row_items[2]
+                this_marker.Mb = row_items[3]
+                genotypes = row_items[4:]
+            elif self.cm_exists:
+                this_marker.cM = row_items[2]
+                genotypes = row_items[3:]
+            elif self.mb_exists:
+                this_marker.Mb = row_items[2]
+                genotypes = row_items[3:]
+            else:
+                genotypes = row_items[2:]
+            for item_count, genotype in enumerate(genotypes):
+                if genotype.upper().strip() in self.configurations:
+                    this_marker.genotypes.append(self.configurations[genotype.upper().strip()])
+                else:
+                    this_marker.genotypes.append("NA")
+
+            self.markers.append(this_marker.__dict__)
+
+        self.write_to_bimbam()
+
+    def write_to_bimbam(self):
+        with open(self.output_files[0], "w") as geno_fh:
+            for marker in self.markers:
+                geno_fh.write(marker['name'])
+                geno_fh.write(", X, Y")
+                geno_fh.write(", " + ", ".join(marker['genotypes']))
+                geno_fh.write("\n")
+
+        with open(self.output_files[1], "w") as pheno_fh:
+            for sample in self.sample_list:
+                pheno_fh.write("1\n")
+
+        with open(self.output_files[2], "w") as snp_fh:
+            for marker in self.markers:
+                if self.mb_exists:
+                    snp_fh.write(marker['name'] + ", " + str(int(float(marker['Mb']) * 1000000)) + ", " + marker['chr'] + "\n")
+                else:
+                    snp_fh.write(marker['name'] + ", " + str(int(float(marker['cM']) * 1000000)) + ", " + marker['chr'] + "\n")
+
+    def get_sample_list(self, row_contents):
+        self.sample_list = []
+        if self.mb_exists:
+            if self.cm_exists:
+                self.sample_list = row_contents[4:]
+            else:
+                self.sample_list = row_contents[3:]
+        else:
+            if self.cm_exists:
+                self.sample_list = row_contents[3:]
+            else:
+                self.sample_list = row_contents[2:]
+
+    def process_rows(self):
+        for self.latest_row_pos, row in enumerate(self.input_fh):
+            self.latest_row_value = row
+            # Take care of headers
+            if not row.strip():
+                continue
+            if row.startswith('#'):
+                continue
+            if row.startswith('Chr'):
+                if 'Mb' in row.split():
+                    self.mb_exists = True
+                if 'cM' in row.split():
+                    self.cm_exists = True
+                self.get_sample_list(row.split())
+                continue
+            if row.startswith('@'):
+                key, _separater, value = row.partition(':')
+                key = key.strip()
+                value = value.strip()
+                if key == "@filler":
+                    raise EmptyConfigurations
+                if key in self.haplotype_notation:
+                    self.configurations[value] = self.haplotype_notation[key]
+                continue
+            if not len(self.configurations):
+                raise EmptyConfigurations
+            yield row
+
+    @classmethod
+    def process_all(cls, old_directory, new_directory):
+        os.chdir(old_directory)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            if group_name == "HSNIH-Palmer":
+                continue
+            geno_output_file = os.path.join(new_directory, group_name + "_geno.txt")
+            pheno_output_file = os.path.join(new_directory, group_name + "_pheno.txt")
+            snp_output_file = os.path.join(new_directory, group_name + "_snps.txt")
+            output_files = [geno_output_file, pheno_output_file, snp_output_file]
+            print("%s -> %s" % (
+                os.path.join(old_directory, input_file), geno_output_file))
+            convertob = ConvertGenoFile(input_file, output_files)
+            try:
+                convertob.convert()
+            except EmptyConfigurations as why:
+                print("  No config info? Continuing...")
+                continue
+            except Exception as why:
+                print("  Exception:", why)
+                print(traceback.print_exc())
+                print("  Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+                                                                  convertob.latest_col_pos))
+                print("  Column is:", convertob.latest_col_value)
+                print("  Row is:", convertob.latest_row_value)
+                break
+
+if __name__ == "__main__":
+    Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
+    New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/bimbam"""
+    #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
+    #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
+    #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
+    #convertob.convert()
+    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
    #ConvertGenoFiles(Geno_Directory)
\ No newline at end of file
```
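To make the header handling in `process_rows` concrete, here is a sketch built around an invented `.geno` fragment: the `@`-lines map the file's allele codes to the `haplotype_notation` keys, and each data row then becomes one BIMBAM genotype line plus one SNP-position line.

```python
# Invented .geno fragment, shown as comments:
#   @mat:B
#   @pat:D
#   @het:H
#   @unk:U
#   Chr  Locus     cM   Mb   BXD1  BXD2    (tab-separated header row)
#   1    rs100001  1.8  3.2  B     D
#
# convert() pairs each @-line's value with haplotype_notation[key],
# yielding configurations = {"B": "1", "D": "0", "H": "0.5", "U": "NA"}.
configurations = {"B": "1", "D": "0", "H": "0.5", "U": "NA"}

# The data row then becomes one BIMBAM mean-genotype line...
genotypes = ["B", "D"]
print("rs100001, X, Y, " + ", ".join(configurations[g] for g in genotypes))
# rs100001, X, Y, 1, 0

# ...and one _snps.txt line: name, Mb * 1000000, chromosome.
print("rs100001, %d, 1" % int(3.2 * 1000000))
# rs100001, 3200000, 1
```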
```diff
diff --git a/wqflask/maintenance/gen_select_dataset.py b/wqflask/maintenance/gen_select_dataset.py
index e5726656..647e58a2 100644
--- a/wqflask/maintenance/gen_select_dataset.py
+++ b/wqflask/maintenance/gen_select_dataset.py
@@ -57,16 +57,13 @@ import urlparse
 
 from pprint import pformat as pf
 
-#Engine = sa.create_engine(zach_settings.SQLALCHEMY_DATABASE_URI)
+#Engine = sa.create_engine(zach_settings.SQL_URI)
 
 # build MySql database connection
 #conn = Engine.connect()
 
-print('WARNING: This conversion is now OBSOLETE as the menu gets built from the database in Javascript using GN_SERVER instead!')
-
-
-def parse_db_uri(db_uri):
+def parse_db_uri():
     """Converts a database URI to the db name, host name, user name, and password"""
 
     parsed_uri = urlparse.urlparse(SQL_URI)
@@ -81,10 +78,10 @@ def parse_db_uri(db_uri):
 
     return db_conn_info
 
-
 def get_species():
     """Build species list"""
-    Cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
+    #Cursor.execute("select Name, MenuName from Species where Species.Name != 'macaque monkey' order by OrderId")
+    Cursor.execute("select Name, MenuName from Species order by OrderId")
     species = list(Cursor.fetchall())
 
     return species
@@ -96,13 +93,14 @@ def get_groups(species):
         Cursor.execute("""select InbredSet.Name, InbredSet.FullName from InbredSet,
                 Species, ProbeFreeze, GenoFreeze, PublishFreeze where Species.Name = '%s'
-                and InbredSet.SpeciesId = Species.Id and InbredSet.Name != 'BXD300' and
+                and InbredSet.SpeciesId = Species.Id and
                 (PublishFreeze.InbredSetId = InbredSet.Id
                     or GenoFreeze.InbredSetId = InbredSet.Id
                     or ProbeFreeze.InbredSetId = InbredSet.Id)
                 group by InbredSet.Name
-                order by InbredSet.Name""" % species_name)
-        groups[species_name] = list(Cursor.fetchall())
+                order by InbredSet.FullName""" % species_name)
+        results = Cursor.fetchall()
+        groups[species_name] = list(results)
 
     return groups
@@ -123,10 +121,20 @@ def get_types(groups):
             else:
                 types[species][group_name] = [("Genotypes", "Genotypes")]
             if group_name in types[species]:
-                types[species][group_name] += build_types(species, group_name)
-            else:
-                types[species][group_name] = build_types(species, group_name)
-
+                types_list = build_types(species, group_name)
+                if len(types_list) > 0:
+                    types[species][group_name] += types_list
+                else:
+                    if not phenotypes_exist(group_name) and not genotypes_exist(group_name):
+                        types[species].pop(group_name, None)
+                        groups[species] = tuple(group for group in groups[species] if group[0] != group_name)
+            else: #ZS: This whole else statement might be unnecessary, need to check
+                types_list = build_types(species, group_name)
+                if len(types_list) > 0:
+                    types[species][group_name] = types_list
+                else:
+                    types[species].pop(group_name, None)
+                    groups[species] = tuple(group for group in groups[species] if group[0] != group_name)
     return types
@@ -190,7 +198,6 @@ def get_datasets(types):
     for species, group_dict in types.iteritems():
         datasets[species] = {}
         for group, type_list in group_dict.iteritems():
-            #print("type_list: ", type_list)
             datasets[species][group] = {}
             for type_name in type_list:
                 these_datasets = build_datasets(species, group, type_name[0])
@@ -203,26 +210,31 @@ def build_datasets(species, group, type_name):
     """Gets dataset names from database"""
     dataset_text = dataset_value = None
+    datasets = []
     if type_name == "Phenotypes":
-        print("GROUP:", group)
-        Cursor.execute("""select InfoFiles.GN_AccesionId from InfoFiles, PublishFreeze, InbredSet where
+        Cursor.execute("""select InfoFiles.GN_AccesionId, PublishFreeze.Name, PublishFreeze.FullName from InfoFiles, PublishFreeze, InbredSet where
                 InbredSet.Name = '%s' and
                 PublishFreeze.InbredSetId = InbredSet.Id and
-                InfoFiles.InfoPageName = PublishFreeze.Name and
-                PublishFreeze.public > 0 and
-                PublishFreeze.confidentiality < 1 order by
-                PublishFreeze.CreateTime desc""" % group)
-
-        results = Cursor.fetchone()
-        if results != None:
-            dataset_id = str(results[0])
+                InfoFiles.InfoPageName = PublishFreeze.Name order by
+                PublishFreeze.CreateTime asc""" % group)
+
+        results = Cursor.fetchall()
+        if len(results) > 0:
+            for result in results:
+                print(result)
+                dataset_id = str(result[0])
+                dataset_value = str(result[1])
+                if group == 'MDP':
+                    dataset_text = "Mouse Phenome Database"
+                else:
+                    #dataset_text = "%s Phenotypes" % group
+                    dataset_text = str(result[2])
+                datasets.append((dataset_id, dataset_value, dataset_text))
         else:
             dataset_id = "None"
-            dataset_value = "%sPublish" % group
-            if group == 'MDP':
-                dataset_text = "Mouse Phenome Database"
-            else:
-                dataset_text = "%s Published Phenotypes" % group
+            dataset_value = "%sPublish" % group
+            dataset_text = "%s Phenotypes" % group
+            datasets.append((dataset_id, dataset_value, dataset_text))
 
     elif type_name == "Genotypes":
         Cursor.execute("""select InfoFiles.GN_AccesionId from InfoFiles, GenoFreeze, InbredSet where
@@ -240,10 +252,9 @@ def build_datasets(species, group, type_name):
             dataset_id = "None"
             dataset_value = "%sGeno" % group
             dataset_text = "%s Genotypes" % group
+        datasets.append((dataset_id, dataset_value, dataset_text))
 
-    if dataset_value:
-        return [(dataset_id, dataset_value, dataset_text)]
-    else:
+    else: # for mRNA expression/ProbeSet
         Cursor.execute("""select ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName from
                     ProbeSetFreeze, ProbeFreeze, InbredSet, Tissue, Species where
                     Species.Name = '%s' and Species.Id = InbredSet.SpeciesId and
@@ -261,26 +272,26 @@ def build_datasets(species, group, type_name):
                 this_dataset_info.append(str(info))
             datasets.append(this_dataset_info)
 
-        return datasets
+    return datasets
 
 def main():
     """Generates and outputs (as json file) the data for the main dropdown menus on the home page"""
 
-    parse_db_uri(SQL_URI)
+    parse_db_uri()
 
     species = get_species()
    groups = get_groups(species)
     types = get_types(groups)
     datasets = get_datasets(types)
 
-    species.append(('All Species', 'All Species'))
-    groups['All Species'] = [('All Groups', 'All Groups')]
-    types['All Species'] = {}
-    types['All Species']['All Groups'] = [('Phenotypes', 'Phenotypes')]
-    datasets['All Species'] = {}
-    datasets['All Species']['All Groups'] = {}
-    datasets['All Species']['All Groups']['Phenotypes'] = [('All Phenotypes','All Phenotypes')]
+    #species.append(('All Species', 'All Species'))
+    #groups['All Species'] = [('All Groups', 'All Groups')]
+    #types['All Species'] = {}
+    #types['All Species']['All Groups'] = [('Phenotypes', 'Phenotypes')]
+    #datasets['All Species'] = {}
+    #datasets['All Species']['All Groups'] = {}
+    #datasets['All Species']['All Groups']['Phenotypes'] = [('All Phenotypes','All Phenotypes')]
 
     data = dict(species=species,
                 groups=groups,
@@ -306,6 +317,6 @@ def _test_it():
     #print("build_datasets:", pf(datasets))
 
 if __name__ == '__main__':
-    Conn = MySQLdb.Connect(**parse_db_uri(SQL_URI))
+    Conn = MySQLdb.Connect(**parse_db_uri())
     Cursor = Conn.cursor()
-    main()
+    main()
\ No newline at end of file
```
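The nested structures that `main()` collects are what the dropdown-menu JSON serializes; a sketch of the shape, with invented species, group, and dataset values, may help when reading the hunks above:

```python
# Invented values sketching the nesting main() builds and serializes:
# species -> groups -> types -> datasets, each leaf a (value, label)
# style tuple; build_datasets() returns (id, value, text) triples.
data = dict(
    species=[("mouse", "Mouse")],
    groups={"mouse": [("BXD", "BXD Family")]},
    types={"mouse": {"BXD": [("Phenotypes", "Phenotypes"),
                             ("Genotypes", "Genotypes")]}},
    datasets={"mouse": {"BXD": {
        "Phenotypes": [("None", "BXDPublish", "BXD Phenotypes")],
        "Genotypes": [("None", "BXDGeno", "BXD Genotypes")],
    }}},
)
```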
```diff
diff --git a/wqflask/maintenance/generate_kinship_from_bimbam.py b/wqflask/maintenance/generate_kinship_from_bimbam.py
new file mode 100644
index 00000000..b53f5dda
--- /dev/null
+++ b/wqflask/maintenance/generate_kinship_from_bimbam.py
@@ -0,0 +1,61 @@
+#!/usr/bin/python
+
+"""
+Generate relatedness matrix files for GEMMA from BIMBAM genotype/phenotype files
+
+This file goes through all of the BIMBAM files in the bimbam directory
+and uses GEMMA to generate their corresponding kinship/relatedness matrix file
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback  # needed by the error-reporting path in process_all
+
+class EmptyConfigurations(Exception): pass
+
+class GenerateKinshipMatrices(object):
+    def __init__(self, group_name, geno_file, pheno_file):
+        self.group_name = group_name
+        self.geno_file = geno_file
+        self.pheno_file = pheno_file
+
+    def generate_kinship(self):
+        gemma_command = "/gnu/store/xhzgjr0jvakxv6h3blj8z496xjig69b0-profile/bin/gemma -g " + self.geno_file + " -p " + self.pheno_file + " -gk 1 -outdir /home/zas1024/genotype_files/genotype/bimbam/ -o " + self.group_name
+        print("command:", gemma_command)
+        os.system(gemma_command)
+
+    @classmethod
+    def process_all(cls, geno_dir, bimbam_dir):
+        os.chdir(geno_dir)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            if group_name == "HSNIH-Palmer":
+                continue
+            geno_input_file = os.path.join(bimbam_dir, group_name + "_geno.txt")
+            pheno_input_file = os.path.join(bimbam_dir, group_name + "_pheno.txt")
+            convertob = GenerateKinshipMatrices(group_name, geno_input_file, pheno_input_file)
+            try:
+                convertob.generate_kinship()
+            except EmptyConfigurations as why:
+                print("  No config info? Continuing...")
+                continue
+            except Exception as why:
+                print("  Exception:", why)
+                print(traceback.print_exc())
+                break
+
+
+if __name__ == "__main__":
+    Geno_Directory = """/export/local/home/zas1024/genotype_files/genotype/"""
+    Bimbam_Directory = """/export/local/home/zas1024/genotype_files/genotype/bimbam/"""
+    GenerateKinshipMatrices.process_all(Geno_Directory, Bimbam_Directory)
+
+    #./gemma -g /home/zas1024/genotype_files/genotype/bimbam/BXD_geno.txt -p /home/zas1024/genotype_files/genotype/bimbam/BXD_pheno.txt -gk 1 -o BXD
\ No newline at end of file
```
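A note on the GEMMA call: `-gk 1` requests the centered relatedness (kinship) matrix, which GEMMA writes to `<outdir>/<o>.cXX.txt` along with a log file. Since the command above is assembled by string concatenation, a path containing spaces or shell metacharacters would break it; a minimal sketch using `subprocess` with an argument list (paths illustrative, not from the commit) avoids that:

```python
# Sketch of the same GEMMA step using subprocess with an argument list,
# which sidesteps the shell-quoting pitfalls of os.system() + string
# concatenation. The gemma_bin and out_dir values are illustrative.
import subprocess

def generate_kinship(gemma_bin, geno_file, pheno_file, out_dir, group_name):
    # -gk 1 computes the centered relatedness (kinship) matrix; GEMMA
    # writes it to <out_dir>/<group_name>.cXX.txt plus a .log.txt file.
    cmd = [gemma_bin,
           "-g", geno_file,
           "-p", pheno_file,
           "-gk", "1",
           "-outdir", out_dir,
           "-o", group_name]
    return subprocess.call(cmd)
```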
```diff
diff --git a/wqflask/maintenance/geno_to_json.py b/wqflask/maintenance/geno_to_json.py
new file mode 100644
index 00000000..9579812a
--- /dev/null
+++ b/wqflask/maintenance/geno_to_json.py
@@ -0,0 +1,197 @@
+#!/usr/bin/python
+
+"""
+Convert .geno files to json
+
+This file goes through all of the genofiles in the genofile directory (.geno)
+and converts them to json files that are used when running the marker regression
+code
+"""
+
+from __future__ import print_function, division, absolute_import
+import sys
+sys.path.append("..")
+import os
+import glob
+import traceback
+import gzip
+
+#import numpy as np
+#from pyLMM import lmm
+
+import simplejson as json
+
+from pprint import pformat as pf
+
+#from utility.tools import flat_files
+
+class EmptyConfigurations(Exception): pass
+
+
+class Marker(object):
+    def __init__(self):
+        self.name = None
+        self.chr = None
+        self.cM = None
+        self.Mb = None
+        self.genotypes = []
+
+class ConvertGenoFile(object):
+
+    def __init__(self, input_file, output_file):
+        self.input_file = input_file
+        self.output_file = output_file
+
+        self.mb_exists = False
+        self.cm_exists = False
+        self.markers = []
+
+        self.latest_row_pos = None
+        self.latest_col_pos = None
+
+        self.latest_row_value = None
+        self.latest_col_value = None
+
+    def convert(self):
+        self.haplotype_notation = {
+            '@mat': "1",
+            '@pat': "0",
+            '@het': "0.5",
+            '@unk': "NA"
+        }
+
+        self.configurations = {}
+        #self.skipped_cols = 3
+
+        #if self.input_file.endswith(".geno.gz"):
+        #    print("self.input_file: ", self.input_file)
+        #    self.input_fh = gzip.open(self.input_file)
+        #else:
+        self.input_fh = open(self.input_file)
+
+        with open(self.output_file, "w") as self.output_fh:
+            #if self.file_type == "geno":
+            self.process_csv()
+            #elif self.file_type == "snps":
+            #    self.process_snps_file()
+
+    def process_csv(self):
+        for row_count, row in enumerate(self.process_rows()):
+            row_items = row.split("\t")
+
+            this_marker = Marker()
+            this_marker.name = row_items[1]
+            this_marker.chr = row_items[0]
+            if self.cm_exists and self.mb_exists:
+                this_marker.cM = row_items[2]
+                this_marker.Mb = row_items[3]
+                genotypes = row_items[4:]
+            elif self.cm_exists:
+                this_marker.cM = row_items[2]
+                genotypes = row_items[3:]
+            elif self.mb_exists:
+                this_marker.Mb = row_items[2]
+                genotypes = row_items[3:]
+            else:
+                genotypes = row_items[2:]
+            for item_count, genotype in enumerate(genotypes):
+                if genotype.upper() in self.configurations:
+                    this_marker.genotypes.append(self.configurations[genotype.upper()])
+                else:
+                    this_marker.genotypes.append("NA")
+
+            #print("this_marker is:", pf(this_marker.__dict__))
+            #if this_marker.chr == "14":
+            self.markers.append(this_marker.__dict__)
+
+        with open(self.output_file, 'w') as fh:
+            json.dump(self.markers, fh, indent="   ", sort_keys=True)
+
+        #    print('configurations:', str(configurations))
+        #self.latest_col_pos = item_count + self.skipped_cols
+        #self.latest_col_value = item
+
+        #if item_count != 0:
+        #    self.output_fh.write(" ")
+        #self.output_fh.write(self.configurations[item.upper()])
+
+        #self.output_fh.write("\n")
+
+    def process_rows(self):
+        for self.latest_row_pos, row in enumerate(self.input_fh):
+            #if self.input_file.endswith(".geno.gz"):
+            #    print("row: ", row)
+            self.latest_row_value = row
+            # Take care of headers
+            if not row.strip():
+                continue
+            if row.startswith('#'):
+                continue
+            if row.startswith('Chr'):
+                if 'Mb' in row.split():
+                    self.mb_exists = True
+                if 'cM' in row.split():
+                    self.cm_exists = True
+                continue
+            if row.startswith('@'):
+                key, _separater, value = row.partition(':')
+                key = key.strip()
+                value = value.strip()
+                if key in self.haplotype_notation:
+                    self.configurations[value] = self.haplotype_notation[key]
+                continue
+            if not len(self.configurations):
+                raise EmptyConfigurations
+            yield row
+
+    @classmethod
+    def process_all(cls, old_directory, new_directory):
+        os.chdir(old_directory)
+        for input_file in glob.glob("*"):
+            if not input_file.endswith(('geno', '.geno.gz')):
+                continue
+            group_name = ".".join(input_file.split('.')[:-1])
+            output_file = os.path.join(new_directory, group_name + ".json")
+            print("%s -> %s" % (
+                os.path.join(old_directory, input_file), output_file))
+            convertob = ConvertGenoFile(input_file, output_file)
+            try:
+                convertob.convert()
+            except EmptyConfigurations as why:
+                print("  No config info? Continuing...")
+                #excepted = True
+                continue
+            except Exception as why:
+                print("  Exception:", why)
+                print(traceback.print_exc())
+                print("  Found in row %s at tabular column %s" % (convertob.latest_row_pos,
+                                                                  convertob.latest_col_pos))
+                print("  Column is:", convertob.latest_col_value)
+                print("  Row is:", convertob.latest_row_value)
+                break
+
+    #def process_snps_file(cls, snps_file, new_directory):
+    #    output_file = os.path.join(new_directory, "mouse_families.json")
+    #    print("%s -> %s" % (snps_file, output_file))
+    #    convertob = ConvertGenoFile(input_file, output_file)
+
+
+if __name__ == "__main__":
+    Old_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype"""
+    New_Geno_Directory = """/export/local/home/zas1024/gn2-zach/genotype_files/genotype/json"""
+    #Input_File = """/home/zas1024/gene/genotype_files/genotypes/BXD.geno"""
+    #Output_File = """/home/zas1024/gene/wqflask/wqflask/pylmm/data/bxd.snps"""
+    #convertob = ConvertGenoFile("/home/zas1024/gene/genotype_files/genotypes/SRxSHRSPF2.geno", "/home/zas1024/gene/genotype_files/new_genotypes/SRxSHRSPF2.json")
+    #convertob.convert()
+    ConvertGenoFile.process_all(Old_Geno_Directory, New_Geno_Directory)
+    #ConvertGenoFiles(Geno_Directory)
+
+    #process_csv(Input_File, Output_File)
\ No newline at end of file
```
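Because `json.dump` serializes each `Marker.__dict__`, the output file is a JSON array of flat objects. A sketch of one element, with invented marker values:

```python
import json

# Invented example of one element of the JSON array geno_to_json.py
# writes; the keys come from Marker.__init__, the values from one
# .geno data row.
marker = {
    "name": "rs100001",                    # row_items[1]
    "chr": "1",                            # row_items[0]
    "cM": "1.8",                           # stays None unless the header has cM
    "Mb": "3.2",                           # stays None unless the header has Mb
    "genotypes": ["1", "0", "0.5", "NA"],  # mapped through configurations
}
print(json.dumps([marker], indent="   ", sort_keys=True))
```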
```diff
diff --git a/wqflask/maintenance/get_group_samplelists.py b/wqflask/maintenance/get_group_samplelists.py
index 04e94886..fb22898a 100644
--- a/wqflask/maintenance/get_group_samplelists.py
+++ b/wqflask/maintenance/get_group_samplelists.py
@@ -6,16 +6,6 @@ import gzip
 
 from base import webqtlConfig
 
-def process_genofiles(geno_dir=webqtlConfig.GENODIR):
-    print("Yabba")
-    #sys.exit("Dabba")
-    os.chdir(geno_dir)
-    for geno_file in glob.glob("*"):
-        if geno_file.lower().endswith(('.geno', '.geno.gz')):
-            #group_name = genofilename.split('.')[0]
-            sample_list = get_samplelist(geno_file)
-
-
 def get_samplelist(file_type, geno_file):
     if file_type == "geno":
         return get_samplelist_from_geno(geno_file)
@@ -37,7 +27,7 @@ def get_samplelist_from_geno(genofilename):
             continue
         break
 
-    headers = line.split()
+    headers = line.split("\t")
 
     if headers[3] == "Mb":
         samplelist = headers[4:]
```

```diff
diff --git a/wqflask/maintenance/quantile_normalize.py b/wqflask/maintenance/quantile_normalize.py
new file mode 100644
index 00000000..41a3aad8
--- /dev/null
+++ b/wqflask/maintenance/quantile_normalize.py
@@ -0,0 +1,129 @@
+from __future__ import absolute_import, print_function, division
+
+import sys
+sys.path.insert(0, './')
+
+from itertools import izip
+
+import MySQLdb
+import urlparse
+
+import numpy as np
+import pandas as pd
+from elasticsearch import Elasticsearch, TransportError
+from elasticsearch.helpers import bulk
+
+from flask import Flask, g, request
+
+from wqflask import app
+from utility.elasticsearch_tools import get_elasticsearch_connection
+from utility.tools import ELASTICSEARCH_HOST, ELASTICSEARCH_PORT, SQL_URI
+
+def parse_db_uri():
+    """Converts a database URI to the db name, host name, user name, and password"""
+
+    parsed_uri = urlparse.urlparse(SQL_URI)
+
+    db_conn_info = dict(
+        db = parsed_uri.path[1:],
+        host = parsed_uri.hostname,
+        user = parsed_uri.username,
+        passwd = parsed_uri.password)
+
+    print(db_conn_info)
+    return db_conn_info
+
+def create_dataframe(input_file):
+    with open(input_file) as f:
+        ncols = len(f.readline().split("\t"))
+
+    input_array = np.loadtxt(open(input_file, "rb"), delimiter="\t", skiprows=1, usecols=range(1, ncols))
+    return pd.DataFrame(input_array)
+
+#This function taken from https://github.com/ShawnLYU/Quantile_Normalize
+def quantileNormalize(df_input):
+    df = df_input.copy()
+    #compute rank
+    dic = {}
+    for col in df:
+        dic.update({col : sorted(df[col])})
+    sorted_df = pd.DataFrame(dic)
+    rank = sorted_df.mean(axis = 1).tolist()
+    #sort
+    for col in df:
+        t = np.searchsorted(np.sort(df[col]), df[col])
+        df[col] = [rank[i] for i in t]
+    return df
+
+def set_data(dataset_name):
+    orig_file = "/home/zas1024/cfw_data/" + dataset_name + ".txt"
+
+    sample_list = []
+    with open(orig_file, 'r') as orig_fh, open('/home/zas1024/cfw_data/quant_norm.csv', 'r') as quant_fh:
+        for i, (line1, line2) in enumerate(izip(orig_fh, quant_fh)):
+            trait_dict = {}
+            sample_list = []
+            if i == 0:
+                sample_names = line1.split('\t')[1:]
+            else:
+                trait_name = line1.split('\t')[0]
+                for i, sample in enumerate(sample_names):
+                    this_sample = {
+                        "name": sample,
+                        "value": line1.split('\t')[i+1],
+                        "qnorm": line2.split('\t')[i+1]
+                    }
+                    sample_list.append(this_sample)
+                query = """SELECT Species.SpeciesName, InbredSet.InbredSetName, ProbeSetFreeze.FullName
+                           FROM Species, InbredSet, ProbeSetFreeze, ProbeFreeze, ProbeSetXRef, ProbeSet
+                           WHERE Species.Id = InbredSet.SpeciesId and
+                                 InbredSet.Id = ProbeFreeze.InbredSetId and
+                                 ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId and
+                                 ProbeSetFreeze.Name = '%s' and
+                                 ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId and
+                                 ProbeSetXRef.ProbeSetId = ProbeSet.Id and
+                                 ProbeSet.Name = '%s'""" % (dataset_name, line1.split('\t')[0])
+                Cursor.execute(query)
+                result_info = Cursor.fetchone()
+
+                yield {
+                    "_index": "traits",
+                    "_type": "trait",
+                    "_source": {
+                        "name": trait_name,
+                        "species": result_info[0],
+                        "group": result_info[1],
+                        "dataset": dataset_name,
+                        "dataset_fullname": result_info[2],
+                        "samples": sample_list,
+                        "transform_types": "qnorm"
+                    }
+                }
+
+if __name__ == '__main__':
+    Conn = MySQLdb.Connect(**parse_db_uri())
+    Cursor = Conn.cursor()
+
+    #es = Elasticsearch([{
+    #    "host": ELASTICSEARCH_HOST, "port": ELASTICSEARCH_PORT
+    #}], timeout=60) if (ELASTICSEARCH_HOST and ELASTICSEARCH_PORT) else None
+
+    es = get_elasticsearch_connection(for_user=False)
+
+    #input_filename = "/home/zas1024/cfw_data/" + sys.argv[1] + ".txt"
+    #input_df = create_dataframe(input_filename)
+    #output_df = quantileNormalize(input_df)
+
+    #output_df.to_csv('quant_norm.csv', sep='\t')
+
+    #out_filename = sys.argv[1][:-4] + '_quantnorm.txt'
+
+    success, _ = bulk(es, set_data(sys.argv[1]))
+
+    response = es.search(
+        index = "traits", doc_type = "trait", body = {
+            "query": { "match": { "name": "ENSMUSG00000028982" } }
+        }
+    )
+
+    print(response)
\ No newline at end of file
```
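`quantileNormalize` ranks each column and replaces every value with the mean of the sorted columns at that rank, so all columns end up with an identical distribution. A self-contained toy check of the same algorithm (numbers invented, not project data):

```python
# Toy check of the quantile normalization used above (values invented).
import numpy as np
import pandas as pd

def quantile_normalize(df_input):
    # Same algorithm as quantileNormalize(): sort each column, average
    # the sorted columns row-wise to get per-rank reference values, then
    # map every original value to the reference value at its rank.
    df = df_input.copy()
    ranks = pd.DataFrame({col: sorted(df[col]) for col in df}).mean(axis=1).tolist()
    for col in df:
        positions = np.searchsorted(np.sort(df[col]), df[col])
        df[col] = [ranks[i] for i in positions]
    return df

df = pd.DataFrame({"A": [2, 1, 3], "B": [4, 6, 5]})
print(quantile_normalize(df))
# After normalization both columns share the same distribution:
#      A    B
# 0  3.5  2.5
# 1  2.5  4.5
# 2  4.5  3.5
```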