From f2af96043989bf36d2961496aaef61adbe3d9701 Mon Sep 17 00:00:00 2001 From: Zachary Sloan Date: Thu, 28 Mar 2013 22:46:19 +0000 Subject: quick_search_table.py seems to be running OK for the PublishXRef table, but will need to check later to make sure --- wqflask/maintenance/quick_search_table.py | 334 +++++++++++++----------------- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 72 +++---- 2 files changed, 184 insertions(+), 222 deletions(-) (limited to 'wqflask') diff --git a/wqflask/maintenance/quick_search_table.py b/wqflask/maintenance/quick_search_table.py index 60446f79..48697e58 100644 --- a/wqflask/maintenance/quick_search_table.py +++ b/wqflask/maintenance/quick_search_table.py @@ -24,8 +24,6 @@ from sqlalchemy.orm import scoped_session, sessionmaker, relationship, backref from sqlalchemy.orm.exc import NoResultFound from sqlalchemy.ext.declarative import declarative_base -from BeautifulSoup import UnicodeDammit - import zach_settings as settings Engine = sa.create_engine(settings.SQLALCHEMY_DATABASE_URI, @@ -35,13 +33,147 @@ Engine = sa.create_engine(settings.SQLALCHEMY_DATABASE_URI, ) Session = scoped_session(sessionmaker(bind=Engine)) #, extension=VersionedListener())) -#Xsession = Session() Base = declarative_base(bind=Engine) Metadata = sa.MetaData() Metadata.bind = Engine +class PublishXRef(Base): + __tablename__ = 'PublishXRef' + + Id = sa.Column(sa.Integer, primary_key=True) + InbredSetId = sa.Column(sa.Integer, primary_key=True) + PhenotypeId = sa.Column(sa.Integer) + PublicationId = sa.Column(sa.Integer) + DataId = sa.Column(sa.Integer) + Locus = sa.Column(sa.Text) + LRS = sa.Column(sa.Float) + additive = sa.Column(sa.Float) + Sequence = sa.Column(sa.Integer) + comments = sa.Column(sa.Text) + + @classmethod + def run(cls): + conn = Engine.connect() + counter = 0 + for ps in page_query(Session.query(cls)): #all() + values = {} + values['table_name'] = cls.__tablename__ + values['the_key'] = json.dumps([ps.Id, ps.InbredSetId]) + values['terms'] = cls.get_unique_terms(ps.Id, ps.InbredSetId) + print("terms is:", values['terms']) + values['result_fields'] = cls.get_result_fields(ps.Id, ps.InbredSetId) + ins = QuickSearch.insert().values(**values) + conn.execute(ins) + counter += 1 + print("Done:", counter) + + @staticmethod + def get_unique_terms(publishxref_id, inbredset_id): + results = Session.query( + "pre_publication_description", + "post_publication_description", + "pre_publication_abbreviation", + "post_publication_abbreviation", + "publication_title" + ).from_statement( + "SELECT Phenotype.Pre_publication_description as pre_publication_description, " + "Phenotype.Post_publication_description as post_publication_description, " + "Phenotype.Pre_publication_abbreviation as pre_publication_abbreviation, " + "Phenotype.Post_publication_abbreviation as post_publication_abbreviation, " + "Publication.Title as publication_title " + "FROM Phenotype, Publication, PublishXRef " + "WHERE PublishXRef.Id = :publishxref_id and " + "PublishXRef.InbredSetId = :inbredset_id and " + "PublishXRef.PhenotypeId = Phenotype.Id and " + "PublishXRef.PublicationId = Publication.Id ").params(publishxref_id=publishxref_id, + inbredset_id=inbredset_id).all() + + unique = set() + for item in results[0]: + #print("locals:", locals()) + if not item: + continue + for token in item.split(): + if token.startswith(('(','[')): + token = token[1:] + if token.endswith((')', ']')): + token = token[:-1] + if token.endswith(';'): + token = token[:-1] + if len(token) > 2: + try: + # This hopefully ensures that the token is utf-8 + token = token.encode('utf-8') + print(" ->", token) + except UnicodeDecodeError: + print("\n-- UDE \n") + # Can't get it into utf-8, we won't use it + continue + + unique.add(token) + print("\nUnique terms are: {}\n".format(unique)) + return " ".join(unique) + + @staticmethod + def get_result_fields(publishxref_id, inbredset_id): + results = Session.query( + "phenotype_id", + "species", + "group_name", + "description", + "lrs", + "publication_id", + "year", + "authors" + ).from_statement( + "SELECT PublishXRef.PhenotypeId as phenotype_id, " + "Species.Name as species, " + "InbredSet.Name as group_name, " + "Phenotype.Original_description as description, " + "PublishXRef.LRS as lrs, " + "PublishXRef.PublicationId as publication_id, " + "Publication.Year as year, " + "Publication.Authors as authors " + "FROM PublishXRef, " + "Phenotype, " + "Publication, " + "InbredSet, " + "Species " + "WHERE PublishXRef.Id = :publishxref_id and " + "PublishXRef.InbredSetId = :inbredset_id and " + "PublishXRef.PhenotypeId = Phenotype.Id and " + "PublishXRef.PublicationId = Publication.Id and " + "InbredSet.Id = :inbredset_id and " + "Species.Id = InbredSet.SpeciesId ").params(publishxref_id=publishxref_id, + inbredset_id=inbredset_id).all() + #"InbredSet.SpeciesId = Species.Id and " + #"Geno.SpeciesId = Species.Id and " + #"Geno.Name = PublishXRef.Locus ").params(publishxref_id=publishxref_id, + # inbredset_id=inbredset_id).all() + for result in results: + print("****", result) + + assert len(set(result for result in results)) == 1, "Different results or no results" + + print("results are:", results) + result = results[0] + result = row2dict(result) + try: + json_results = json.dumps(result, sort_keys=True) + except UnicodeDecodeError: + print("\n\nTrying to massage unicode\n\n") + for key, value in result.iteritems(): + print("\tkey is:", key) + print("\tvalue is:", value) + if isinstance(value, basestring): + result[key] = value.decode('utf-8', errors='ignore') + json_results = json.dumps(result, sort_keys=True) + return json_results + + + class ProbeSetXRef(Base): __tablename__ = 'ProbeSetXRef' @@ -59,8 +191,6 @@ class ProbeSetXRef(Base): additive = sa.Column(sa.Float) h2 = sa.Column(sa.Float) - #__mapper_args__ = {'primary_key':[ProbeSetXRef.ProbeSetId, ProbeSetXRef.ProbeSetFreezeId]} - @classmethod def run(cls): conn = Engine.connect() @@ -76,7 +206,7 @@ class ProbeSetXRef(Base): ins = QuickSearch.insert().values(**values) conn.execute(ins) counter += 1 - print("Done:", counter) + print("Done:", counter) @staticmethod def get_unique_terms(probeset_id): @@ -119,31 +249,13 @@ class ProbeSetXRef(Base): print("\nUnique terms are: {}\n".format(unique)) return " ".join(unique) - #def get_species(dataset_id): - # print("Before species query") - # results = Session.query("Name").from_statement("SELECT Species.Name " - # "FROM ProbeSetXRef, " - # "ProbeSetFreeze, " - # "ProbeFreeze, " - # "InbredSet, " - # "Species " - # "WHERE ProbeSetFreeze.Id =:probeset_freeze_id and " - # "ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and " - # "ProbeFreeze.InbredSetId = InbredSet.Id and " - # "InbredSet.SpeciesId = Species.Id").params(probeset_freeze_id=dataset_id).all() - # print("After query") - # - # assert len(set([result.Name for result in results])) == 1, "Multiple names?" - # - # print("species is:", results[0].Name) - # - # return results[0].Name @staticmethod def get_result_fields(probeset_id, dataset_id): results = Session.query( "name", "species", + "group", "dataset", "dataset_name", "symbol", @@ -157,6 +269,7 @@ class ProbeSetXRef(Base): ).from_statement( "SELECT ProbeSet.Name as name, " "Species.Name as species, " + "InbredSet.Name as group, " "ProbeSetFreeze.Name as dataset, " "ProbeSetFreeze.FullName as dataset_name, " "ProbeSet.Symbol as symbol, " @@ -201,183 +314,27 @@ class ProbeSetXRef(Base): if isinstance(value, basestring): result[key] = value.decode('utf-8', errors='ignore') json_results = json.dumps(result, sort_keys=True) - - #print("json is: ", json_results) - + return json_results + QuickSearch = sa.Table("QuickSearch", Metadata, sa.Column('table_name', sa.String(15), primary_key=True, nullable=False, autoincrement=False), # table that item is inserted from sa.Column('the_key', sa.String(30), primary_key=True, nullable=False, autoincrement=False), # key in database table sa.Column('terms', sa.Text), # terms to compare search string with - #sa.Column('species', sa.Text), sa.Column('result_fields', sa.Text) # json ) QuickSearch.drop(Engine, checkfirst=True) Metadata.create_all(Engine) -#class QuickSearch(Base): -# table_name = Column(String) -# the_key = Column(String) -# terms = Column(String) -# -# def __init__(self, table_name, the_key, terms, category, species, result_fields): -# self.table_name = table_name -# self.the_key = the_key -# self.terms = terms -# self.species = species -# self.category = category -# self.result_fields = json.dumps(sort_keys=True) - - -def get_unique_terms(trait_type, trait_id): - #if not args: - # return None - - if trait_type=="ProbeSet": - results = Session.query( - "name", - "symbol", - "description", - "alias" - ).from_statement( - "SELECT ProbeSet.Name as name, " - "ProbeSet.Symbol as symbol, " - "ProbeSet.description as description, " - "ProbeSet.alias as alias " - "FROM ProbeSet" - "WHERE ProbeSet.Id = :probeset_id ").params(probeset_id=trait_id).all() - - unique = set() - for item in results[0]: - #print("locals:", locals()) - if not item: - continue - for token in item.split(): - if token.startswith(('(','[')): - token = token[1:] - if token.endswith((')', ']')): - token = token[:-1] - if token.endswith(';'): - token = token[:-1] - if len(token) > 2: - try: - # This hopefully ensures that the token is utf-8 - token = token.encode('utf-8') - print(" ->", token) - except UnicodeDecodeError: - print("\n-- UDE \n") - # Can't get it into utf-8, we won't use it - continue - - unique.add(token) - print("\nUnique terms are: {}\n".format(unique)) - return " ".join(unique) - -def main(): - conn = Engine.connect() - counter = 0 - - ProbeSetXRef.run() - - #for ps in page_query(Session.query(ProbeSet)): #all() - # values = {} - # values['table_name'] = "ProbeSetXRef" - # values['the_key'] = json.dumps([ps.ProbeSetId, ps.ProbeSetFreezeId]) - # values['terms'] = get_unique_terms("ProbeSet", ps.ProbeSetId) - # print("terms is:", values['terms']) - # #values['species'] = get_species("ProbeSet", ps.Id) - # values['result_fields'] = get_result_fields("ProbeSet", ps.ProbeSetId, ps.ProbeSetFreezeId) - # ins = QuickSearch.insert().values(**values) - # conn.execute(ins) - # counter += 1 - # print("Done:", counter) - - -def get_species(trait_type, trait_id): - if trait_type == "ProbeSet": - print("Before species query") - results = Session.query("Name").from_statement("SELECT Species.Name " - "FROM ProbeSetXRef, " - "ProbeSetFreeze, " - "ProbeFreeze, " - "InbredSet, " - "Species " - "WHERE ProbeSetXRef.ProbeSetId =:probeset_id and " - "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and " - "ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and " - "ProbeFreeze.InbredSetId = InbredSet.Id and " - "InbredSet.SpeciesId = Species.Id").params(probeset_id=trait_id).all() - print("After query") - - assert len(set([result.Name for result in results])) == 1, "Multiple names?" - - print("species is:", results[0].Name) - - return results[0].Name - -#def get_result_fields(trait_type, *args): -# if trait_type == "ProbeSet": -# print("qs1") -# results = Session.query( -# "name", -# "symbol", -# "description", -# "chr", "mb", -# "genbank_id", -# "gene_id", -# "chip_id", -# "chip_name" -# ).from_statement( -# "SELECT ProbeSet.Name as name, " -# "ProbeSet.Symbol as symbol, " -# "ProbeSet.description as description, " -# "ProbeSet.Chr as chr, " -# "ProbeSet.Mb as mb, " -# "ProbeSet.GenbankId as genbank_id, " -# "ProbeSet.GeneId as gene_id, " -# "ProbeSet.ChipId as chip_id, " -# "GeneChip.Name as chip_name " -# "FROM ProbeSet, GeneChip " -# "WHERE ProbeSet.ChipId = GeneChip.Id and " -# "ProbeSet.Id = :probeset_id ").params(probeset_id=*args[0], dataset_id=*args[1]).all() -# print("qs2") -# for result in results: -# print(result) -# assert len(set(result for result in results)) == 1, "Different results" -# -# print("results are:", results) -# result = results[0] -# result = row2dict(result) -# try: -# json_results = json.dumps(result, sort_keys=True) -# except UnicodeDecodeError: -# print("\n\nTrying to massage unicode\n\n") -# #print("result.__dict__ is [{}]: {}".format(type(result.__dict__), result.__dict__)) -# #resultd = dict(**result.__dict__) -# for key, value in result.iteritems(): -# print(" key is:", key) -# print(" value is:", value) -# if isinstance(value, basestring): -# result[key] = value.decode('utf-8', errors='ignore') -# json_results = json.dumps(result, sort_keys=True) -# -# #print("json is: ", json_results) -# -# return json_results - def row2dict(row): - return dict(zip(row.keys(), row)) # http://stackoverflow.com/a/2848519/1175849 - #"""http://stackoverflow.com/a/1960546/1175849""" - #d = {} - #for column in row.__table__.columns: - # d[column.name] = getattr(row, column.name) - # - #return d + """http://stackoverflow.com/a/2848519/1175849""" + return dict(zip(row.keys(), row)) + def page_query(q): """http://stackoverflow.com/a/1217947/1175849""" @@ -391,5 +348,10 @@ def page_query(q): if not r: break + +def main(): + PublishXRef.run() + ProbeSetXRef.run() + if __name__ == "__main__": main() \ No newline at end of file diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py index 7ed0f3e5..163b876a 100644 --- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py +++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py @@ -26,42 +26,42 @@ from scipy import stats from pprint import pformat as pf -#from utility.benchmark import Bench -# -##np.seterr('raise') -# -#def run(pheno_vector, -# genotype_matrix, -# restricted_max_likelihood=True, -# refit=False, -# temp_data=None): -# """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics -# -# restricted_max_likelihood -- whether to use restricted max likelihood; True or False -# refit -- whether to refit the variance component for each marker -# temp_data -- TempData object that stores the progress for each major step of the -# calculations ("calculate_kinship" and "GWAS" take the majority of time) -# -# """ -# -# with Bench("Calculate Kinship"): -# kinship_matrix = calculate_kinship(genotype_matrix, temp_data) -# -# with Bench("Create LMM object"): -# lmm_ob = LMM(pheno_vector, kinship_matrix) -# -# with Bench("LMM_ob fitting"): -# lmm_ob.fit() -# -# with Bench("Doing GWAS"): -# t_stats, p_values = GWAS(pheno_vector, -# genotype_matrix, -# kinship_matrix, -# restricted_max_likelihood=True, -# refit=False, -# temp_data=temp_data) -# Bench().report() -# return t_stats, p_values +from utility.benchmark import Bench + +#np.seterr('raise') + +def run(pheno_vector, + genotype_matrix, + restricted_max_likelihood=True, + refit=False, + temp_data=None): + """Takes the phenotype vector and genotype matrix and returns a set of p-values and t-statistics + + restricted_max_likelihood -- whether to use restricted max likelihood; True or False + refit -- whether to refit the variance component for each marker + temp_data -- TempData object that stores the progress for each major step of the + calculations ("calculate_kinship" and "GWAS" take the majority of time) + + """ + + with Bench("Calculate Kinship"): + kinship_matrix = calculate_kinship(genotype_matrix, temp_data) + + with Bench("Create LMM object"): + lmm_ob = LMM(pheno_vector, kinship_matrix) + + with Bench("LMM_ob fitting"): + lmm_ob.fit() + + with Bench("Doing GWAS"): + t_stats, p_values = GWAS(pheno_vector, + genotype_matrix, + kinship_matrix, + restricted_max_likelihood=True, + refit=False, + temp_data=temp_data) + Bench().report() + return t_stats, p_values def matrixMult(A,B): -- cgit v1.2.3