Diffstat (limited to 'wqflask/maintenance')
-rw-r--r-- | wqflask/maintenance/generate_probesetfreeze_file.py | 123
-rw-r--r-- | wqflask/maintenance/print_benchmark.py              |  43
-rw-r--r-- | wqflask/maintenance/quick_search_table.py           | 503
3 files changed, 669 insertions, 0 deletions
diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py
new file mode 100644
index 00000000..91a2b8a1
--- /dev/null
+++ b/wqflask/maintenance/generate_probesetfreeze_file.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+
+from __future__ import absolute_import, print_function, division
+
+import sys
+
+sys.path.insert(0, "..")
+
+import os
+import collections
+import csv
+
+import MySQLdb
+
+from base import webqtlConfig
+
+from pprint import pformat as pf
+
+
+def get_cursor():
+    con = MySQLdb.Connect(db=webqtlConfig.DB_UPDNAME,
+                          host=webqtlConfig.MYSQL_UPDSERVER,
+                          user=webqtlConfig.DB_UPDUSER,
+                          passwd=webqtlConfig.DB_UPDPASSWD)
+    cursor = con.cursor()
+    return cursor
+
+def show_progress(process, counter):
+    if counter % 1000 == 0:
+        print("{}: {}".format(process, counter))
+
+def get_strains(cursor):
+    cursor.execute("""select Strain.Name
+                      from Strain, StrainXRef, InbredSet
+                      where Strain.Id = StrainXRef.StrainId and
+                      StrainXRef.InbredSetId = InbredSet.Id
+                      and InbredSet.Name=%s;
+                   """, "BXD")
+
+    strains = [strain[0] for strain in cursor.fetchall()]
+    print("strains:", pf(strains))
+    for strain in strains:
+        print("    -", strain)
+
+    return strains
+
+def get_probeset_vals(cursor, dataset_name):
+    cursor.execute(""" select ProbeSet.Id, ProbeSet.Name
+                       from ProbeSetXRef,
+                            ProbeSetFreeze,
+                            ProbeSet
+                       where ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and
+                             ProbeSetFreeze.Name = %s and
+                             ProbeSetXRef.ProbeSetId = ProbeSet.Id;
+                    """, dataset_name)
+
+    probesets = cursor.fetchall()
+
+    print("Fetched probesets")
+
+    probeset_vals = collections.OrderedDict()
+
+    for counter, probeset in enumerate(probesets):
+        cursor.execute(""" select Strain.Name, ProbeSetData.value
+                           from ProbeSetData, ProbeSetXRef, ProbeSetFreeze, Strain
+                           where ProbeSetData.Id = ProbeSetXRef.DataId
+                           and ProbeSetData.StrainId = Strain.Id
+                           and ProbeSetXRef.ProbeSetId = %s
+                           and ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId
+                           and ProbeSetFreeze.Name = %s;
+                        """, (probeset[0], dataset_name))
+        val_dic = collections.OrderedDict()
+        vals = cursor.fetchall()
+        for val in vals:
+            val_dic[val[0]] = val[1]
+
+        probeset_vals[probeset[1]] = val_dic
+        show_progress("Querying DB", counter)
+
+    return probeset_vals
+
+def trim_strains(strains, probeset_vals):
+    trimmed_strains = []
+    #print("probeset_vals is:", pf(probeset_vals))
+    first_probeset = list(probeset_vals.itervalues())[0]
+    print("\n**** first_probeset is:", pf(first_probeset))
+    for strain in strains:
+        print("\n**** strain is:", pf(strain))
+        if strain in first_probeset:
+            trimmed_strains.append(strain)
+    print("trimmed_strains:", pf(trimmed_strains))
+    return trimmed_strains
+
+def write_data_matrix_file(strains, probeset_vals, filename):
+    with open(filename, "wb") as fh:
+        csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL)
+        #print("strains is:", pf(strains))
+        csv_writer.writerow(['ID'] + strains)
+        for counter, probeset in enumerate(probeset_vals):
+            row_data = [probeset]
+            for strain in strains:
+                #print("probeset is: ", pf(probeset_vals[probeset]))
+                row_data.append(probeset_vals[probeset][strain])
+            #print("row_data is: ", pf(row_data))
+            csv_writer.writerow(row_data)
+            show_progress("Writing", counter)
+
+def main():
+    filename = os.path.expanduser("~/gene/wqflask/maintenance/" +
+                                  "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2" +
+                                  "(Oct08)_RankInv_Beta.txt")
+    dataset_name = "Eye_AXBXA_1008_RankInv"
+
+    cursor = get_cursor()
+    strains = get_strains(cursor)
+    print("Getting probset_vals")
+    probeset_vals = get_probeset_vals(cursor, dataset_name)
+    print("Finished getting probeset_vals")
+    trimmed_strains = trim_strains(strains, probeset_vals)
+    write_data_matrix_file(trimmed_strains, probeset_vals, filename)
+
+if __name__ == '__main__':
+    main()
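For orientation (not part of the commit): write_data_matrix_file() above emits a fully quoted CSV whose header row is "ID" followed by the trimmed strain names, with one row of values per probeset. A minimal sketch for eyeballing that output, assuming the script has already been run and reusing the filename hard-coded in main():

import csv
import os

# Same path as the one hard-coded in main() above; assumes the script was run first.
matrix_file = os.path.expanduser("~/gene/wqflask/maintenance/"
                                 "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2"
                                 "(Oct08)_RankInv_Beta.txt")

with open(matrix_file) as fh:
    reader = csv.reader(fh)
    header = next(reader)     # ['ID', <strain name>, <strain name>, ...]
    first_row = next(reader)  # [<probeset name>, <one value per strain>, ...]
    print(header[:5])
    print(first_row[:5])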
print("Getting probset_vals") + probeset_vals = get_probeset_vals(cursor, dataset_name) + print("Finished getting probeset_vals") + trimmed_strains = trim_strains(strains, probeset_vals) + write_data_matrix_file(trimmed_strains, probeset_vals, filename) + +if __name__ == '__main__': + main() diff --git a/wqflask/maintenance/print_benchmark.py b/wqflask/maintenance/print_benchmark.py new file mode 100644 index 00000000..540e0904 --- /dev/null +++ b/wqflask/maintenance/print_benchmark.py @@ -0,0 +1,43 @@ +#!/usr/bin/python + +from __future__ import absolute_import, print_function, division + +import time + +from pprint import pformat as pf + + +class TheCounter(object): + Counters = {} + + def __init__(self): + start_time = time.time() + for counter in range(170000): + self.print_it(counter) + self.time_took = time.time() - start_time + TheCounter.Counters[self.__class__.__name__] = self.time_took + +class PrintAll(TheCounter): + def print_it(self, counter): + print(counter) + +class PrintSome(TheCounter): + def print_it(self, counter): + if counter % 1000 == 0: + print(counter) + +class PrintNone(TheCounter): + def print_it(self, counter): + pass + + +def new_main(): + print("Running new_main") + tests = [PrintAll, PrintSome, PrintNone] + for test in tests: + test() + + print(pf(TheCounter.Counters)) + +if __name__ == '__main__': + new_main()
diff --git a/wqflask/maintenance/quick_search_table.py b/wqflask/maintenance/quick_search_table.py
new file mode 100644
index 00000000..b07e7656
--- /dev/null
+++ b/wqflask/maintenance/quick_search_table.py
@@ -0,0 +1,503 @@
+"""Creates a table used for the quick search feature.
+
+One column contains the terms to match the user's search against. Another contains the result
+fields in json format
+
+Results will be returned for each of several trait types: mRNA assays, phenotypes, genotypes, and
+(maybe later) genes
+
+For each trait type, the results for each species should be given This table will then list
+each trait, its dataset, and several columns determined by its trait type (phenotype, genotype, etc)
+
+"""
+
+from __future__ import print_function, division, absolute_import
+
+import sys
+sys.path.append("../../..")
+
+import simplejson as json
+
+import sqlalchemy as sa
+from sqlalchemy.orm import scoped_session, sessionmaker
+from sqlalchemy.ext.declarative import declarative_base
+
+#from pprint import pformat as pf
+
+import zach_settings as settings
+
+Engine = sa.create_engine(settings.SQLALCHEMY_DATABASE_URI,
+                          #encoding='utf-8',
+                          #client_encoding='utf-8',
+                          #echo="debug",w
+                          )
+
+Session = scoped_session(sessionmaker(bind=Engine)) #, extension=VersionedListener()))
+
+Base = declarative_base(bind=Engine)
+Metadata = sa.MetaData()
+Metadata.bind = Engine
+
+class PublishXRef(Base):
+    """Class that corresponds with the PublishXRef table in the database.
+
+    The PublishXRef table links phenotype traits and their publications.
+
+    This class is used to add phenotype traits to the quick search table.
+
+    """
+
+    __tablename__ = 'PublishXRef'
+
+    Id = sa.Column(sa.Integer, primary_key=True)
+    InbredSetId = sa.Column(sa.Integer, primary_key=True)
+    PhenotypeId = sa.Column(sa.Integer)
+    PublicationId = sa.Column(sa.Integer)
+    DataId = sa.Column(sa.Integer)
+    Locus = sa.Column(sa.Text)
+    LRS = sa.Column(sa.Float)
+    additive = sa.Column(sa.Float)
+    Sequence = sa.Column(sa.Integer)
+    comments = sa.Column(sa.Text)
+
+    @classmethod
+    def run(cls):
+        """Connects to database and inserts phenotype trait info into the Quicksearch table."""
+        conn = Engine.connect()
+        counter = 0
+        for pub_row in page_query(Session.query(cls)):   #all()
+            values = {}
+            values['table_name'] = cls.__tablename__
+            values['the_key'] = json.dumps([pub_row.Id, pub_row.InbredSetId])
+            values['terms'] = cls.get_unique_terms(pub_row.Id, pub_row.InbredSetId)
+            print("terms is:", values['terms'])
+            values['result_fields'] = cls.get_result_fields(pub_row.Id, pub_row.InbredSetId)
+            ins = QuickSearch.insert().values(**values)
+            conn.execute(ins)
+            counter += 1
+            print("Done:", counter)
+
+    @staticmethod
+    def get_unique_terms(publishxref_id, inbredset_id):
+        """Finds unique terms for each item in the PublishXRef table to match a query against"""
+        results = Session.query(
+            "pre_publication_description",
+            "post_publication_description",
+            "pre_publication_abbreviation",
+            "post_publication_abbreviation",
+            "publication_title"
+            ).from_statement(
+                "SELECT Phenotype.Pre_publication_description as pre_publication_description, "
+                "Phenotype.Post_publication_description as post_publication_description, "
+                "Phenotype.Pre_publication_abbreviation as pre_publication_abbreviation, "
+                "Phenotype.Post_publication_abbreviation as post_publication_abbreviation, "
+                "Publication.Title as publication_title "
+                "FROM Phenotype, Publication, PublishXRef "
+                "WHERE PublishXRef.Id = :publishxref_id and "
+                "PublishXRef.InbredSetId = :inbredset_id and "
+                "PublishXRef.PhenotypeId = Phenotype.Id and "
+                "PublishXRef.PublicationId = Publication.Id ").params(publishxref_id=publishxref_id,
+                                                                      inbredset_id=inbredset_id).all()
+
+        unique = set()
+        for item in results[0]:
+            #print("locals:", locals())
+            if not item:
+                continue
+            for token in item.split():
+                if token.startswith(('(','[')):
+                    token = token[1:]
+                if token.endswith((')', ']')):
+                    token = token[:-1]
+                if token.endswith(';'):
+                    token = token[:-1]
+                if len(token) > 2:
+                    try:
+                        # This hopefully ensures that the token is utf-8
+                        token = token.encode('utf-8')
+                        print(" ->", token)
+                    except UnicodeDecodeError:
+                        print("\n-- UDE \n")
+                        # Can't get it into utf-8, we won't use it
+                        continue
+
+                    unique.add(token)
+        #print("\nUnique terms are: {}\n".format(unique))
+        return " ".join(unique)
+
+    @staticmethod
+    def get_result_fields(publishxref_id, inbredset_id):
+        """Gets the result fields (columns) that appear on the result page as a json string"""
+        results = Session.query(
+            "phenotype_id",
+            "species",
+            "group_name",
+            "description",
+            "lrs",
+            "publication_id",
+            "pubmed_id",
+            "year",
+            "authors"
+            ).from_statement(
+                "SELECT PublishXRef.PhenotypeId as phenotype_id, "
+                "Species.Name as species, "
+                "InbredSet.Name as group_name, "
+                "Phenotype.Original_description as description, "
+                "PublishXRef.LRS as lrs, "
+                "PublishXRef.PublicationId as publication_id, "
+                "Publication.PubMed_ID as pubmed_id, "
+                "Publication.Year as year, "
+                "Publication.Authors as authors "
+                "FROM PublishXRef, "
+                "Phenotype, "
+                "Publication, "
+                "InbredSet, "
+                "Species "
+                "WHERE PublishXRef.Id = :publishxref_id and "
+                "PublishXRef.InbredSetId = :inbredset_id and "
+                "PublishXRef.PhenotypeId = Phenotype.Id and "
+                "PublishXRef.PublicationId = Publication.Id and "
+                "InbredSet.Id = :inbredset_id and "
+                "Species.Id = InbredSet.SpeciesId ").params(publishxref_id=publishxref_id,
+                                                            inbredset_id=inbredset_id).all()
+
+        assert len(set(result for result in results)) == 1, "Different results or no results"
+
+        result = results[0]
+        result = row2dict(result)
+        try:
+            json_results = json.dumps(result, sort_keys=True)
+        except UnicodeDecodeError:
+            print("\n\nTrying to massage unicode\n\n")
+            for key, value in result.iteritems():
+                print("\tkey is:", key)
+                print("\tvalue is:", value)
+                if isinstance(value, basestring):
+                    result[key] = value.decode('utf-8', errors='ignore')
+            json_results = json.dumps(result, sort_keys=True)
+
+        return json_results
+
+
+class GenoXRef(Base):
+    """Class that corresponds with the GenoXRef table in the database.
+
+    The GenoXRef table links genotype traits and their data.
+
+    This class is used to add genotype traits to the quick search table.
+
+    """
+
+    __tablename__ = 'GenoXRef'
+
+    GenoFreezeId = sa.Column(sa.Integer, primary_key=True)
+    GenoId = sa.Column(sa.Integer, primary_key=True)
+    DataId = sa.Column(sa.Integer)
+    cM = sa.Column(sa.Float)
+    Used_for_mapping = sa.Column(sa.Text)
+
+    @classmethod
+    def run(cls):
+        """Connects to database and inserts genotype trait info into the Quicksearch table."""
+        conn = Engine.connect()
+        counter = 0
+        for item in page_query(Session.query(cls)):   #all()
+            values = {}
+            values['table_name'] = cls.__tablename__
+            values['the_key'] = json.dumps([item.GenoId, item.GenoFreezeId])
+            values['terms'] = cls.get_unique_terms(item.GenoId)
+            print("terms is:", values['terms'])
+            if values['terms']:
+                values['result_fields'] = cls.get_result_fields(item.GenoId, item.GenoFreezeId)
+                ins = QuickSearch.insert().values(**values)
+                conn.execute(ins)
+            counter += 1
+            print("Done:", counter)
+
+    @staticmethod
+    def get_unique_terms(geno_id):
+        """Finds unique terms for each item in the GenoXRef table to match a query against"""
+        print("geno_id: ", geno_id)
+        results = Session.query(
+            "name",
+            "marker_name"
+            ).from_statement(
+                "SELECT Geno.Name as name, "
+                "Geno.Marker_Name as marker_name "
+                "FROM Geno "
+                "WHERE Geno.Id = :geno_id ").params(geno_id=geno_id).all()
+
+        unique = set()
+        if len(results):
+            for item in results[0]:
+                #print("locals:", locals())
+                if not item:
+                    continue
+                for token in item.split():
+                    if len(token) > 2:
+                        try:
+                            # This hopefully ensures that the token is utf-8
+                            token = token.encode('utf-8')
+                            print(" ->", token)
+                        except UnicodeDecodeError:
+                            print("\n-- UDE \n")
+                            # Can't get it into utf-8, we won't use it
+                            continue
+
+                        unique.add(token)
+        return " ".join(unique)
+
+
+    @staticmethod
+    def get_result_fields(geno_id, dataset_id):
+        """Gets the result fields (columns) that appear on the result page as a json string"""
+        results = Session.query(
+            "name",
+            "marker_name",
+            "group_name",
+            "species",
+            "dataset",
+            "dataset_name",
+            "chr", "mb",
+            "source"
+            ).from_statement(
+                "SELECT Geno.Name as name, "
+                "Geno.Marker_Name as marker_name, "
+                "InbredSet.Name as group_name, "
+                "Species.Name as species, "
+                "GenoFreeze.Name as dataset, "
+                "GenoFreeze.FullName as dataset_name, "
+                "Geno.Chr as chr, "
+                "Geno.Mb as mb, "
+                "Geno.Source as source "
+                "FROM Geno, "
+                "GenoXRef, "
+                "GenoFreeze, "
+                "InbredSet, "
+                "Species "
+                "WHERE Geno.Id = :geno_id and "
+                "GenoXRef.GenoId = Geno.Id and "
+                "GenoFreeze.Id = :dataset_id and "
+                "GenoXRef.GenoFreezeId = GenoFreeze.Id and "
+                "InbredSet.Id = GenoFreeze.InbredSetId and "
+                "InbredSet.SpeciesId = Species.Id ").params(geno_id=geno_id,
+                                                            dataset_id=dataset_id).all()
+        assert len(set(result for result in results)) == 1, "Different results"
+
+        result = results[0]
+        result = row2dict(result)
+        try:
+            json_results = json.dumps(result, sort_keys=True)
+        except UnicodeDecodeError:
+            print("\n\nTrying to massage unicode\n\n")
+            for key, value in result.iteritems():
+                print("\tkey is:", key)
+                print("\tvalue is:", value)
+                if isinstance(value, basestring):
+                    result[key] = value.decode('utf-8', errors='ignore')
+            json_results = json.dumps(result, sort_keys=True)
+
+        return json_results
+
+class ProbeSetXRef(Base):
+    """Class that corresponds with the ProbeSetXRef table in the database.
+
+    The ProbeSetXRef table links mRNA expression traits and their sample data.
+
+    This class is used to add mRNA expression traits to the quick search table.
+
+    """
+
+    __tablename__ = 'ProbeSetXRef'
+
+    ProbeSetFreezeId = sa.Column(sa.Integer, primary_key=True)
+    ProbeSetId = sa.Column(sa.Integer, primary_key=True)
+    DataId = sa.Column(sa.Integer, unique=True)
+    Locus_old = sa.Column(sa.Text)
+    LRS_old = sa.Column(sa.Float)
+    pValue_old = sa.Column(sa.Float)
+    mean = sa.Column(sa.Float)
+    se = sa.Column(sa.Float)
+    Locus = sa.Column(sa.Text)
+    LRS = sa.Column(sa.Float)
+    pValue = sa.Column(sa.Float)
+    additive = sa.Column(sa.Float)
+    h2 = sa.Column(sa.Float)
+
+    @classmethod
+    def run(cls):
+        """Connects to db and inserts mRNA expression trait info into the Quicksearch table."""
+        conn = Engine.connect()
+        counter = 0
+        for ps_row in page_query(Session.query(cls)):   #all()
+            values = {}
+            values['table_name'] = cls.__tablename__
+            values['the_key'] = json.dumps([ps_row.ProbeSetId, ps_row.ProbeSetFreezeId])
+            values['terms'] = cls.get_unique_terms(ps_row.ProbeSetId)
+            print("terms is:", values['terms'])
+            values['result_fields'] = cls.get_result_fields(ps_row.ProbeSetId,
+                                                            ps_row.ProbeSetFreezeId)
+            if values['result_fields'] == None:
+                continue
+            ins = QuickSearch.insert().values(**values)
+            conn.execute(ins)
+            counter += 1
+            print("Done:", counter)
+
+    @staticmethod
+    def get_unique_terms(probeset_id):
+        """Finds unique terms for each item in the ProbeSetXRef table to match a query against"""
+        results = Session.query(
+            "name",
+            "symbol",
+            "description",
+            "alias"
+            ).from_statement(
+                "SELECT ProbeSet.Name as name, "
+                "ProbeSet.Symbol as symbol, "
+                "ProbeSet.description as description, "
+                "ProbeSet.alias as alias "
+                "FROM ProbeSet "
+                "WHERE ProbeSet.Id = :probeset_id ").params(probeset_id=probeset_id).all()
+
+        unique = set()
+        if len(results):
+            for item in results[0]:
+                if not item:
+                    continue
+                for token in item.split():
+                    if token.startswith(('(','[')):
+                        token = token[1:]
+                    if token.endswith((')', ']')):
+                        token = token[:-1]
+                    if token.endswith(';'):
+                        token = token[:-1]
+                    if len(token) > 2:
+                        try:
+                            # This hopefully ensures that the token is utf-8
+                            token = token.encode('utf-8')
+                            print(" ->", token)
+                        except UnicodeDecodeError:
+                            print("\n-- UDE \n")
+                            # Can't get it into utf-8, we won't use it
+                            continue
+
+                        unique.add(token)
+        return " ".join(unique)
+
+
+    @staticmethod
+    def get_result_fields(probeset_id, dataset_id):
+        """Gets the result fields (columns) that appear on the result page as a json string"""
+        print("probeset_id: ", probeset_id)
+        print("dataset_id: ", dataset_id)
+        results = Session.query(
+            "name",
+            "species",
+            "group_name",
+            "dataset",
+            "dataset_name",
+            "symbol",
+            "description",
+            "chr", "mb",
+            "lrs", "mean",
+            "genbank_id",
+            "gene_id",
+            "chip_id",
+            "chip_name"
+            ).from_statement(
+                "SELECT ProbeSet.Name as name, "
+                "Species.Name as species, "
+                "InbredSet.Name as group_name, "
+                "ProbeSetFreeze.Name as dataset, "
+                "ProbeSetFreeze.FullName as dataset_name, "
+                "ProbeSet.Symbol as symbol, "
+                "ProbeSet.description as description, "
+                "ProbeSet.Chr as chr, "
+                "ProbeSet.Mb as mb, "
+                "ProbeSetXRef.LRS as lrs, "
+                "ProbeSetXRef.mean as mean, "
+                "ProbeSet.GenbankId as genbank_id, "
+                "ProbeSet.GeneId as gene_id, "
+                "ProbeSet.ChipId as chip_id, "
+                "GeneChip.Name as chip_name "
+                "FROM ProbeSet, "
+                "ProbeSetXRef, "
+                "ProbeSetFreeze, "
+                "ProbeFreeze, "
+                "InbredSet, "
+                "Species, "
+                "GeneChip "
+                "WHERE ProbeSetXRef.ProbeSetId = :probeset_id and "
+                "ProbeSetXRef.ProbeSetFreezeId = :dataset_id and "
+                "ProbeSetXRef.ProbeSetId = ProbeSet.Id and "
+                "ProbeSet.ChipId = GeneChip.Id and "
+                "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and "
+                "ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and "
+                "ProbeFreeze.InbredSetId = InbredSet.Id and "
+                "InbredSet.SpeciesId = Species.Id ").params(probeset_id=probeset_id,
+                                                            dataset_id=dataset_id).all()
+
+        if len(set(result for result in results)) != 1:
+            return None
+
+        result = results[0]
+        result = row2dict(result)
+        try:
+            json_results = json.dumps(result, sort_keys=True)
+        except UnicodeDecodeError:
+            print("\n\nTrying to massage unicode\n\n")
+            for key, value in result.iteritems():
+                print("\tkey is:", key)
+                print("\tvalue is:", value)
+                if isinstance(value, basestring):
+                    result[key] = value.decode('utf-8', errors='ignore')
+            json_results = json.dumps(result, sort_keys=True)
+
+        return json_results
+
+QuickSearch = sa.Table("QuickSearch", Metadata,
+        # table_name is the table that item is inserted from
+        sa.Column('table_name', sa.String(15),
+                  primary_key=True, nullable=False, autoincrement=False),
+        sa.Column('the_key', sa.String(30),
+                  primary_key=True, nullable=False, autoincrement=False), # key in database table
+        sa.Column('terms', sa.Text), # terms to compare search string with
+        sa.Column('result_fields', sa.Text), # json
+        mysql_engine = 'MyISAM',
+        )
+
+#QuickSearch.drop(Engine, checkfirst=True)
+#Metadata.create_all(Engine)
+
+
+def row2dict(row):
+    """From http://stackoverflow.com/a/2848519/1175849"""
+    return dict(zip(row.keys(), row))
+
+
+def page_query(query):
+    """From http://stackoverflow.com/a/1217947/1175849"""
+    offset = 0
+    while True:
+        rrr = False
+        for elem in query.limit(1000).offset(offset):
+            rrr = True
+            yield elem
+        offset += 1000
+        if not rrr:
+            break
+
+
+def main():
+    """Populate the QuickSearch table that is used with the quick search features.
+
+    Add all items from the ProbeSetXRef, GenoXRef, and PublishXRef tables to the QuickSearch tables.
+
+    """
+    ProbeSetXRef.run()
+    #GenoXRef.run()
+    #PublishXRef.run()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
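Taken together, each run() method above flattens one cross-reference row into a single QuickSearch row: table_name records which XRef table it came from, the_key is a JSON-encoded pair of ids, terms is the whitespace-joined bag of searchable tokens, and result_fields is the JSON blob rendered on the results page. A minimal sketch of how such a table could be queried once populated; the LIKE-based lookup and legacy-style Engine.execute() call are assumptions for illustration, matching the old SQLAlchemy API used in the script, not the search code GeneNetwork actually ships:

import simplejson as json
import sqlalchemy as sa
import zach_settings as settings  # same settings module the script above imports

engine = sa.create_engine(settings.SQLALCHEMY_DATABASE_URI)

search_term = "shh"  # hypothetical user query
rows = engine.execute(
    sa.text("SELECT table_name, the_key, result_fields "
            "FROM QuickSearch WHERE terms LIKE :term"),
    term="%" + search_term + "%")

for table_name, the_key, result_fields in rows:
    # the_key decodes back to the id pair; result_fields to the display columns.
    print(table_name, json.loads(the_key), json.loads(result_fields))

Note that the commented-out QuickSearch.drop(Engine, checkfirst=True) / Metadata.create_all(Engine) lines in the script are what would (re)create the MyISAM table before main() is run.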