Diffstat (limited to 'wqflask/maintenance')
-rw-r--r-- | wqflask/maintenance/generate_probesetfreeze_file.py | 123
-rw-r--r-- | wqflask/maintenance/print_benchmark.py              |  43
-rw-r--r-- | wqflask/maintenance/quick_search_table.py           | 503
3 files changed, 669 insertions, 0 deletions
diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py
new file mode 100644
index 00000000..91a2b8a1
--- /dev/null
+++ b/wqflask/maintenance/generate_probesetfreeze_file.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+
+from __future__ import absolute_import, print_function, division
+
+import sys
+
+sys.path.insert(0, "..")
+
+import os
+import collections
+import csv
+
+import MySQLdb
+
+from base import webqtlConfig
+
+from pprint import pformat as pf
+
+
+def get_cursor():
+    con = MySQLdb.Connect(db=webqtlConfig.DB_UPDNAME,
+                          host=webqtlConfig.MYSQL_UPDSERVER,
+                          user=webqtlConfig.DB_UPDUSER,
+                          passwd=webqtlConfig.DB_UPDPASSWD)
+    cursor = con.cursor()
+    return cursor
+
+def show_progress(process, counter):
+    if counter % 1000 == 0:
+        print("{}: {}".format(process, counter))
+
+def get_strains(cursor):
+    cursor.execute("""select Strain.Name
+                      from Strain, StrainXRef, InbredSet
+                      where Strain.Id = StrainXRef.StrainId and
+                      StrainXRef.InbredSetId = InbredSet.Id
+                      and InbredSet.Name=%s;
+                   """, "BXD")
+
+    strains = [strain[0] for strain in cursor.fetchall()]
+    print("strains:", pf(strains))
+    for strain in strains:
+        print("    -", strain)
+
+    return strains
+
+def get_probeset_vals(cursor, dataset_name):
+    cursor.execute(""" select ProbeSet.Id, ProbeSet.Name
+                       from ProbeSetXRef,
+                            ProbeSetFreeze,
+                            ProbeSet
+                       where ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and
+                             ProbeSetFreeze.Name = %s and
+                             ProbeSetXRef.ProbeSetId = ProbeSet.Id;
+                    """, dataset_name)
+
+    probesets = cursor.fetchall()
+
+    print("Fetched probesets")
+
+    probeset_vals = collections.OrderedDict()
+
+    for counter, probeset in enumerate(probesets):
+        cursor.execute(""" select Strain.Name, ProbeSetData.value
+                           from ProbeSetData, ProbeSetXRef, ProbeSetFreeze, Strain
+                           where ProbeSetData.Id = ProbeSetXRef.DataId
+                           and ProbeSetData.StrainId = Strain.Id
+                           and ProbeSetXRef.ProbeSetId = %s
+                           and ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId
+                           and ProbeSetFreeze.Name = %s;
+                        """, (probeset[0], dataset_name))
+        val_dic = collections.OrderedDict()
+        vals = cursor.fetchall()
+        for val in vals:
+            val_dic[val[0]] = val[1]
+
+        probeset_vals[probeset[1]] = val_dic
+        show_progress("Querying DB", counter)
+
+    return probeset_vals
+
+def trim_strains(strains, probeset_vals):
+    trimmed_strains = []
+    #print("probeset_vals is:", pf(probeset_vals))
+    first_probeset = list(probeset_vals.itervalues())[0]
+    print("\n**** first_probeset is:", pf(first_probeset))
+    for strain in strains:
+        print("\n**** strain is:", pf(strain))
+        if strain in first_probeset:
+            trimmed_strains.append(strain)
+    print("trimmed_strains:", pf(trimmed_strains))
+    return trimmed_strains
+
+def write_data_matrix_file(strains, probeset_vals, filename):
+    with open(filename, "wb") as fh:
+        csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL)
+        #print("strains is:", pf(strains))
+        csv_writer.writerow(['ID'] + strains)
+        for counter, probeset in enumerate(probeset_vals):
+            row_data = [probeset]
+            for strain in strains:
+                #print("probeset is: ", pf(probeset_vals[probeset]))
+                row_data.append(probeset_vals[probeset][strain])
+            #print("row_data is: ", pf(row_data))
+            csv_writer.writerow(row_data)
+            show_progress("Writing", counter)
+
+def main():
+    filename = os.path.expanduser("~/gene/wqflask/maintenance/" +
+                                  "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2" +
+                                  "(Oct08)_RankInv_Beta.txt")
+    dataset_name = "Eye_AXBXA_1008_RankInv"
+
+    cursor = get_cursor()
+    strains = get_strains(cursor)
+    print("Getting probset_vals")
+    probeset_vals = get_probeset_vals(cursor, dataset_name)
+    print("Finished getting probeset_vals")
+    trimmed_strains = trim_strains(strains, probeset_vals)
+    write_data_matrix_file(trimmed_strains, probeset_vals, filename)
+
+if __name__ == '__main__':
+    main()
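For orientation (not part of the commit): write_data_matrix_file() above emits a fully quoted CSV whose header row is "ID" followed by the trimmed strain names, with one row of values per probeset. A minimal sketch for eyeballing that output, assuming the script has already been run and reusing the filename hard-coded in main():

import csv
import os

# Same path as the one hard-coded in main() above; assumes the script was run first.
matrix_file = os.path.expanduser("~/gene/wqflask/maintenance/"
                                 "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2"
                                 "(Oct08)_RankInv_Beta.txt")

with open(matrix_file) as fh:
    reader = csv.reader(fh)
    header = next(reader)     # ['ID', <strain name>, <strain name>, ...]
    first_row = next(reader)  # [<probeset name>, <one value per strain>, ...]
    print(header[:5])
    print(first_row[:5])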
print("Getting probset_vals") + probeset_vals = get_probeset_vals(cursor, dataset_name) + print("Finished getting probeset_vals") + trimmed_strains = trim_strains(strains, probeset_vals) + write_data_matrix_file(trimmed_strains, probeset_vals, filename) + +if __name__ == '__main__': + main() diff --git a/wqflask/maintenance/print_benchmark.py b/wqflask/maintenance/print_benchmark.py new file mode 100644 index 00000000..540e0904 --- /dev/null +++ b/wqflask/maintenance/print_benchmark.py @@ -0,0 +1,43 @@ +#!/usr/bin/python + +from __future__ import absolute_import, print_function, division + +import time + +from pprint import pformat as pf + + +class TheCounter(object): + Counters = {} + + def __init__(self): + start_time = time.time() + for counter in range(170000): + self.print_it(counter) + self.time_took = time.time() - start_time + TheCounter.Counters[self.__class__.__name__] = self.time_took + +class PrintAll(TheCounter): + def print_it(self, counter): + print(counter) + +class PrintSome(TheCounter): + def print_it(self, counter): + if counter % 1000 == 0: + print(counter) + +class PrintNone(TheCounter): + def print_it(self, counter): + pass + + +def new_main(): + print("Running new_main") + tests = [PrintAll, PrintSome, PrintNone] + for test in tests: + test() + + print(pf(TheCounter.Counters)) + +if __name__ == '__main__': + new_main()
diff --git a/wqflask/maintenance/quick_search_table.py b/wqflask/maintenance/quick_search_table.py
new file mode 100644
index 00000000..b07e7656
--- /dev/null
+++ b/wqflask/maintenance/quick_search_table.py
@@ -0,0 +1,503 @@
+"""Creates a table used for the quick search feature.
+
+One column contains the terms to match the user's search against. Another contains the result
+fields in json format
+
+Results will be returned for each of several trait types: mRNA assays, phenotypes, genotypes, and
+(maybe later) genes
+
+For each trait type, the results for each species should be given This table will then list
+each trait, its dataset, and several columns determined by its trait type (phenotype, genotype, etc)
+
+"""
+
+from __future__ import print_function, division, absolute_import
+
+import sys
+sys.path.append("../../..")
+
+import simplejson as json
+
+import sqlalchemy as sa
+from sqlalchemy.orm import scoped_session, sessionmaker
+from sqlalchemy.ext.declarative import declarative_base
+
+#from pprint import pformat as pf
+
+import zach_settings as settings
+
+Engine = sa.create_engine(settings.SQLALCHEMY_DATABASE_URI,
+                          #encoding='utf-8',
+                          #client_encoding='utf-8',
+                          #echo="debug",w
+                          )
+
+Session = scoped_session(sessionmaker(bind=Engine)) #, extension=VersionedListener()))
+
+Base = declarative_base(bind=Engine)
+Metadata = sa.MetaData()
+Metadata.bind = Engine
+
+class PublishXRef(Base):
+    """Class that corresponds with the PublishXRef table in the database.
+
+    The PublishXRef table links phenotype traits and their publications.
+
+    This class is used to add phenotype traits to the quick search table.
+
+    """
+
+    __tablename__ = 'PublishXRef'
+
+    Id = sa.Column(sa.Integer, primary_key=True)
+    InbredSetId = sa.Column(sa.Integer, primary_key=True)
+    PhenotypeId = sa.Column(sa.Integer)
+    PublicationId = sa.Column(sa.Integer)
+    DataId = sa.Column(sa.Integer)
+    Locus = sa.Column(sa.Text)
+    LRS = sa.Column(sa.Float)
+    additive = sa.Column(sa.Float)
+    Sequence = sa.Column(sa.Integer)
+    comments = sa.Column(sa.Text)
+
+    @classmethod
+    def run(cls):
+        """Connects to database and inserts phenotype trait info into the Quicksearch table."""
+        conn = Engine.connect()
+        counter = 0
+        for pub_row in page_query(Session.query(cls)):   #all()
+            values = {}
+            values['table_name'] = cls.__tablename__
+            values['the_key'] = json.dumps([pub_row.Id, pub_row.InbredSetId])
+            values['terms'] = cls.get_unique_terms(pub_row.Id, pub_row.InbredSetId)
+            print("terms is:", values['terms'])
+            values['result_fields'] = cls.get_result_fields(pub_row.Id, pub_row.InbredSetId)
+            ins = QuickSearch.insert().values(**values)
+            conn.execute(ins)
+            counter += 1
+            print("Done:", counter)
+
+    @staticmethod
+    def get_unique_terms(publishxref_id, inbredset_id):
+        """Finds unique terms for each item in the PublishXRef table to match a query against"""
+        results = Session.query(
+            "pre_publication_description",
+            "post_publication_description",
+            "pre_publication_abbreviation",
+            "post_publication_abbreviation",
+            "publication_title"
+            ).from_statement(
+                "SELECT Phenotype.Pre_publication_description as pre_publication_description, "
+                "Phenotype.Post_publication_description as post_publication_description, "
+                "Phenotype.Pre_publication_abbreviation as pre_publication_abbreviation, "
+                "Phenotype.Post_publication_abbreviation as post_publication_abbreviation, "
+                "Publication.Title as publication_title "
+                "FROM Phenotype, Publication, PublishXRef "
+                "WHERE PublishXRef.Id = :publishxref_id and "
+                "PublishXRef.InbredSetId = :inbredset_id and "
+                "PublishXRef.PhenotypeId = Phenotype.Id and "
+                "PublishXRef.PublicationId = Publication.Id ").params(publishxref_id=publishxref_id,
+                                                                      inbredset_id=inbredset_id).all()
+
+        unique = set()
+        for item in results[0]:
+            #print("locals:", locals())
+            if not item:
+                continue
+            for token in item.split():
+                if token.startswith(('(','[')):
+                    token = token[1:]
+                if token.endswith((')', ']')):
+                    token = token[:-1]
+                if token.endswith(';'):
+                    token = token[:-1]
+                if len(token) > 2:
+                    try:
+                        # This hopefully ensures that the token is utf-8
+                        token = token.encode('utf-8')
+                        print(" ->", token)
+                    except UnicodeDecodeError:
+                        print("\n-- UDE \n")
+                        # Can't get it into utf-8, we won't use it
+                        continue
+
+                    unique.add(token)
+        #print("\nUnique terms are: {}\n".format(unique))
+        return " ".join(unique)
+
+    @staticmethod
+    def get_result_fields(publishxref_id, inbredset_id):
+        """Gets the result fields (columns) that appear on the result page as a json string"""
+        results = Session.query(
+            "phenotype_id",
+            "species",
+            "group_name",
+            "description",
+            "lrs",
+            "publication_id",
+            "pubmed_id",
+            "year",
+            "authors"
+            ).from_statement(
+                "SELECT PublishXRef.PhenotypeId as phenotype_id, "
+                "Species.Name as species, "
+                "InbredSet.Name as group_name, "
+                "Phenotype.Original_description as description, "
+                "PublishXRef.LRS as lrs, "
+                "PublishXRef.PublicationId as publication_id, "
+                "Publication.PubMed_ID as pubmed_id, "
+                "Publication.Year as year, "
+                "Publication.Authors as authors "
+                "FROM PublishXRef, "
+                "Phenotype, "
+                "Publication, "
+                "InbredSet, "
+                "Species "
+                "WHERE PublishXRef.Id = :publishxref_id and "
+                "PublishXRef.InbredSetId = :inbredset_id and "
+                "PublishXRef.PhenotypeId = Phenotype.Id and "
+                "PublishXRef.PublicationId = Publication.Id and "
+                "InbredSet.Id = :inbredset_id and "
+                "Species.Id = InbredSet.SpeciesId ").params(publishxref_id=publishxref_id,
+                                                            inbredset_id=inbredset_id).all()
+
+        assert len(set(result for result in results)) == 1, "Different results or no results"
+
+        result = results[0]
+        result = row2dict(result)
+        try:
+            json_results = json.dumps(result, sort_keys=True)
+        except UnicodeDecodeError:
+            print("\n\nTrying to massage unicode\n\n")
+            for key, value in result.iteritems():
+                print("\tkey is:", key)
+                print("\tvalue is:", value)
+                if isinstance(value, basestring):
+                    result[key] = value.decode('utf-8', errors='ignore')
+            json_results = json.dumps(result, sort_keys=True)
+
+        return json_results
+
+
+class GenoXRef(Base):
+    """Class that corresponds with the GenoXRef table in the database.
+
+    The GenoXRef table links genotype traits and their data.
+
+    This class is used to add genotype traits to the quick search table.
+
+    """
+
+    __tablename__ = 'GenoXRef'
+
+    GenoFreezeId = sa.Column(sa.Integer, primary_key=True)
+    GenoId = sa.Column(sa.Integer, primary_key=True)
+    DataId = sa.Column(sa.Integer)
+    cM = sa.Column(sa.Float)
+    Used_for_mapping = sa.Column(sa.Text)
+
+    @classmethod
+    def run(cls):
+        """Connects to database and inserts genotype trait info into the Quicksearch table."""
+        conn = Engine.connect()
+        counter = 0
+        for item in page_query(Session.query(cls)):   #all()
+            values = {}
+            values['table_name'] = cls.__tablename__
+            values['the_key'] = json.dumps([item.GenoId, item.GenoFreezeId])
+            values['terms'] = cls.get_unique_terms(item.GenoId)
+            print("terms is:", values['terms'])
+            if values['terms']:
+                values['result_fields'] = cls.get_result_fields(item.GenoId, item.GenoFreezeId)
+                ins = QuickSearch.insert().values(**values)
+                conn.execute(ins)
+            counter += 1
+            print("Done:", counter)
+
+    @staticmethod
+    def get_unique_terms(geno_id):
+        """Finds unique terms for each item in the GenoXRef table to match a query against"""
+        print("geno_id: ", geno_id)
+        results = Session.query(
+            "name",
+            "marker_name"
+            ).from_statement(
+                "SELECT Geno.Name as name, "
+                "Geno.Marker_Name as marker_name "
+                "FROM Geno "
+                "WHERE Geno.Id = :geno_id ").params(geno_id=geno_id).all()
+
+        unique = set()
+        if len(results):
+            for item in results[0]:
+                #print("locals:", locals())
+                if not item:
+                    continue
+                for token in item.split():
+                    if len(token) > 2:
+                        try:
+                            # This hopefully ensures that the token is utf-8
+                            token = token.encode('utf-8')
+                            print(" ->", token)
+                        except UnicodeDecodeError:
+                            print("\n-- UDE \n")
+                            # Can't get it into utf-8, we won't use it
+                            continue
+
+                        unique.add(token)
+        return " ".join(unique)
+
+
+    @staticmethod
+    def get_result_fields(geno_id, dataset_id):
+        """Gets the result fields (columns) that appear on the result page as a json string"""
+        results = Session.query(
+            "name",
+            "marker_name",
+            "group_name",
+            "species",
+            "dataset",
+            "dataset_name",
+            "chr", "mb",
+            "source"
+            ).from_statement(
+                "SELECT Geno.Name as name, "
+                "Geno.Marker_Name as marker_name, "
+                "InbredSet.Name as group_name, "
+                "Species.Name as species, "
+                "GenoFreeze.Name as dataset, "
+                "GenoFreeze.FullName as dataset_name, "
+                "Geno.Chr as chr, "
+                "Geno.Mb as mb, "
+                "Geno.Source as source "
+                "FROM Geno, "
+                "GenoXRef, "
+                "GenoFreeze, "
+                "InbredSet, "
+                "Species "
+                "WHERE Geno.Id = :geno_id and "
+                "GenoXRef.GenoId = Geno.Id and "
+                "GenoFreeze.Id = :dataset_id and "
+                "GenoXRef.GenoFreezeId = GenoFreeze.Id and "
+                "InbredSet.Id = GenoFreeze.InbredSetId and "
+                "InbredSet.SpeciesId = Species.Id ").params(geno_id=geno_id,
+                                                            dataset_id=dataset_id).all()
+        assert len(set(result for result in results)) == 1, "Different results"
+
+        result = results[0]
+        result = row2dict(result)
+        try:
+            json_results = json.dumps(result, sort_keys=True)
+        except UnicodeDecodeError:
+            print("\n\nTrying to massage unicode\n\n")
+            for key, value in result.iteritems():
+                print("\tkey is:", key)
+                print("\tvalue is:", value)
+                if isinstance(value, basestring):
+                    result[key] = value.decode('utf-8', errors='ignore')
+            json_results = json.dumps(result, sort_keys=True)
+
+        return json_results
+
+class ProbeSetXRef(Base):
+    """Class that corresponds with the ProbeSetXRef table in the database.
+
+    The ProbeSetXRef table links mRNA expression traits and their sample data.
+
+    This class is used to add mRNA expression traits to the quick search table.
+
+    """
+
+    __tablename__ = 'ProbeSetXRef'
+
+    ProbeSetFreezeId = sa.Column(sa.Integer, primary_key=True)
+    ProbeSetId = sa.Column(sa.Integer, primary_key=True)
+    DataId = sa.Column(sa.Integer, unique=True)
+    Locus_old = sa.Column(sa.Text)
+    LRS_old = sa.Column(sa.Float)
+    pValue_old = sa.Column(sa.Float)
+    mean = sa.Column(sa.Float)
+    se = sa.Column(sa.Float)
+    Locus = sa.Column(sa.Text)
+    LRS = sa.Column(sa.Float)
+    pValue = sa.Column(sa.Float)
+    additive = sa.Column(sa.Float)
+    h2 = sa.Column(sa.Float)
+
+    @classmethod
+    def run(cls):
+        """Connects to db and inserts mRNA expression trait info into the Quicksearch table."""
+        conn = Engine.connect()
+        counter = 0
+        for ps_row in page_query(Session.query(cls)):   #all()
+            values = {}
+            values['table_name'] = cls.__tablename__
+            values['the_key'] = json.dumps([ps_row.ProbeSetId, ps_row.ProbeSetFreezeId])
+            values['terms'] = cls.get_unique_terms(ps_row.ProbeSetId)
+            print("terms is:", values['terms'])
+            values['result_fields'] = cls.get_result_fields(ps_row.ProbeSetId,
+                                                            ps_row.ProbeSetFreezeId)
+            if values['result_fields'] == None:
+                continue
+            ins = QuickSearch.insert().values(**values)
+            conn.execute(ins)
+            counter += 1
+            print("Done:", counter)
+
+    @staticmethod
+    def get_unique_terms(probeset_id):
+        """Finds unique terms for each item in the ProbeSetXRef table to match a query against"""
+        results = Session.query(
+            "name",
+            "symbol",
+            "description",
+            "alias"
+            ).from_statement(
+                "SELECT ProbeSet.Name as name, "
+                "ProbeSet.Symbol as symbol, "
+                "ProbeSet.description as description, "
+                "ProbeSet.alias as alias "
+                "FROM ProbeSet "
+                "WHERE ProbeSet.Id = :probeset_id ").params(probeset_id=probeset_id).all()
+
+        unique = set()
+        if len(results):
+            for item in results[0]:
+                if not item:
+                    continue
+                for token in item.split():
+                    if token.startswith(('(','[')):
+                        token = token[1:]
+                    if token.endswith((')', ']')):
+                        token = token[:-1]
+                    if token.endswith(';'):
+                        token = token[:-1]
+                    if len(token) > 2:
+                        try:
+                            # This hopefully ensures that the token is utf-8
+                            token = token.encode('utf-8')
+                            print(" ->", token)
+                        except UnicodeDecodeError:
+                            print("\n-- UDE \n")
+                            # Can't get it into utf-8, we won't use it
+                            continue
+
+                        unique.add(token)
+        return " ".join(unique)
+
+
+    @staticmethod
+    def get_result_fields(probeset_id, dataset_id):
+        """Gets the result fields (columns) that appear on the result page as a json string"""
+        print("probeset_id: ", probeset_id)
+        print("dataset_id: ", dataset_id)
+        results = Session.query(
+            "name",
+            "species",
+            "group_name",
+            "dataset",
+            "dataset_name",
+            "symbol",
+            "description",
+            "chr", "mb",
+            "lrs", "mean",
+            "genbank_id",
+            "gene_id",
+            "chip_id",
+            "chip_name"
+            ).from_statement(
+                "SELECT ProbeSet.Name as name, "
+                "Species.Name as species, "
+                "InbredSet.Name as group_name, "
+                "ProbeSetFreeze.Name as dataset, "
+                "ProbeSetFreeze.FullName as dataset_name, "
+                "ProbeSet.Symbol as symbol, "
+                "ProbeSet.description as description, "
+                "ProbeSet.Chr as chr, "
+                "ProbeSet.Mb as mb, "
+                "ProbeSetXRef.LRS as lrs, "
+                "ProbeSetXRef.mean as mean, "
+                "ProbeSet.GenbankId as genbank_id, "
+                "ProbeSet.GeneId as gene_id, "
+                "ProbeSet.ChipId as chip_id, "
+                "GeneChip.Name as chip_name "
+                "FROM ProbeSet, "
+                "ProbeSetXRef, "
+                "ProbeSetFreeze, "
+                "ProbeFreeze, "
+                "InbredSet, "
+                "Species, "
+                "GeneChip "
+                "WHERE ProbeSetXRef.ProbeSetId = :probeset_id and "
+                "ProbeSetXRef.ProbeSetFreezeId = :dataset_id and "
+                "ProbeSetXRef.ProbeSetId = ProbeSet.Id and "
+                "ProbeSet.ChipId = GeneChip.Id and "
+                "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and "
+                "ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and "
+                "ProbeFreeze.InbredSetId = InbredSet.Id and "
+                "InbredSet.SpeciesId = Species.Id ").params(probeset_id=probeset_id,
+                                                            dataset_id=dataset_id).all()
+
+        if len(set(result for result in results)) != 1:
+            return None
+
+        result = results[0]
+        result = row2dict(result)
+        try:
+            json_results = json.dumps(result, sort_keys=True)
+        except UnicodeDecodeError:
+            print("\n\nTrying to massage unicode\n\n")
+            for key, value in result.iteritems():
+                print("\tkey is:", key)
+                print("\tvalue is:", value)
+                if isinstance(value, basestring):
+                    result[key] = value.decode('utf-8', errors='ignore')
+            json_results = json.dumps(result, sort_keys=True)
+
+        return json_results
+
+QuickSearch = sa.Table("QuickSearch", Metadata,
+        # table_name is the table that item is inserted from
+        sa.Column('table_name', sa.String(15),
+                  primary_key=True, nullable=False, autoincrement=False),
+        sa.Column('the_key', sa.String(30),
+                  primary_key=True, nullable=False, autoincrement=False), # key in database table
+        sa.Column('terms', sa.Text), # terms to compare search string with
+        sa.Column('result_fields', sa.Text), # json
+        mysql_engine = 'MyISAM',
+        )
+
+#QuickSearch.drop(Engine, checkfirst=True)
+#Metadata.create_all(Engine)
+
+
+def row2dict(row):
+    """From http://stackoverflow.com/a/2848519/1175849"""
+    return dict(zip(row.keys(), row))
+
+
+def page_query(query):
+    """From http://stackoverflow.com/a/1217947/1175849"""
+    offset = 0
+    while True:
+        rrr = False
+        for elem in query.limit(1000).offset(offset):
+            rrr = True
+            yield elem
+        offset += 1000
+        if not rrr:
+            break
+
+
+def main():
+    """Populate the QuickSearch table that is used with the quick search features.
+
+    Add all items from the ProbeSetXRef, GenoXRef, and PublishXRef tables to the QuickSearch tables.
+
+    """
+    ProbeSetXRef.run()
+    #GenoXRef.run()
+    #PublishXRef.run()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
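Taken together, each run() method above flattens one cross-reference row into a single QuickSearch row: table_name records which XRef table it came from, the_key is a JSON-encoded pair of ids, terms is the whitespace-joined bag of searchable tokens, and result_fields is the JSON blob rendered on the results page. A minimal sketch of how such a table could be queried once populated; the LIKE-based lookup and legacy-style Engine.execute() call are assumptions for illustration, matching the old SQLAlchemy API used in the script, not the search code GeneNetwork actually ships:

import simplejson as json
import sqlalchemy as sa
import zach_settings as settings  # same settings module the script above imports

engine = sa.create_engine(settings.SQLALCHEMY_DATABASE_URI)

search_term = "shh"  # hypothetical user query
rows = engine.execute(
    sa.text("SELECT table_name, the_key, result_fields "
            "FROM QuickSearch WHERE terms LIKE :term"),
    term="%" + search_term + "%")

for table_name, the_key, result_fields in rows:
    # the_key decodes back to the id pair; result_fields to the display columns.
    print(table_name, json.loads(the_key), json.loads(result_fields))

Note that the commented-out QuickSearch.drop(Engine, checkfirst=True) / Metadata.create_all(Engine) lines in the script are what would (re)create the MyISAM table before main() is run.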