aboutsummaryrefslogtreecommitdiff
path: root/wqflask/maintenance
diff options
context:
space:
mode:
Diffstat (limited to 'wqflask/maintenance')
-rw-r--r--wqflask/maintenance/generate_probesetfreeze_file.py123
-rw-r--r--wqflask/maintenance/quick_search_table.py484
2 files changed, 607 insertions, 0 deletions
diff --git a/wqflask/maintenance/generate_probesetfreeze_file.py b/wqflask/maintenance/generate_probesetfreeze_file.py
new file mode 100644
index 00000000..91a2b8a1
--- /dev/null
+++ b/wqflask/maintenance/generate_probesetfreeze_file.py
@@ -0,0 +1,123 @@
+#!/usr/bin/python
+
+from __future__ import absolute_import, print_function, division
+
+import sys
+
+sys.path.insert(0, "..")
+
+import os
+import collections
+import csv
+
+import MySQLdb
+
+from base import webqtlConfig
+
+from pprint import pformat as pf
+
+
def get_cursor():
    """Connect to the update database (credentials from webqtlConfig) and return a cursor.

    NOTE(review): the connection object goes out of scope unreferenced, so it
    can never be closed or committed explicitly by callers — acceptable for a
    one-shot maintenance script, but worth confirming autocommit is on.
    """
    con = MySQLdb.Connect(db=webqtlConfig.DB_UPDNAME,
                          host=webqtlConfig.MYSQL_UPDSERVER,
                          user=webqtlConfig.DB_UPDUSER,
                          passwd=webqtlConfig.DB_UPDPASSWD)
    cursor = con.cursor()
    return cursor
+
def show_progress(process, counter):
    """Emit a progress line of the form "<process>: <counter>" once every 1000 items."""
    if not counter % 1000:
        print("{}: {}".format(process, counter))
+
def get_strains(cursor):
    """Return the names of all strains belonging to the "BXD" inbred set.

    Joins Strain -> StrainXRef -> InbredSet and keeps the first column
    (Strain.Name) of each row; echoes the list for debugging.
    """
    cursor.execute("""select Strain.Name
                      from Strain, StrainXRef, InbredSet
                      where Strain.Id = StrainXRef.StrainId and
                      StrainXRef.InbredSetId = InbredSet.Id
                      and InbredSet.Name=%s;
                """, "BXD")

    strains = [strain[0] for strain in cursor.fetchall()]
    print("strains:", pf(strains))
    for strain in strains:
        print(" -", strain)

    return strains
+
def get_probeset_vals(cursor, dataset_name):
    """Return an OrderedDict mapping probeset name -> {strain name: value}
    for every probeset in the named ProbeSetFreeze dataset.

    Issues one query per probeset, so this is slow on large datasets;
    progress is printed every 1000 probesets via show_progress.
    """
    cursor.execute(""" select ProbeSet.Id, ProbeSet.Name
                       from ProbeSetXRef,
                       ProbeSetFreeze,
                       ProbeSet
                       where ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and
                       ProbeSetFreeze.Name = %s and
                       ProbeSetXRef.ProbeSetId = ProbeSet.Id;
                """, dataset_name)

    probesets = cursor.fetchall()

    print("Fetched probesets")

    # Ordered so the output matrix rows keep the database's probeset order.
    probeset_vals = collections.OrderedDict()

    for counter, probeset in enumerate(probesets):
        # probeset is (ProbeSet.Id, ProbeSet.Name)
        cursor.execute(""" select Strain.Name, ProbeSetData.value
                           from ProbeSetData, ProbeSetXRef, ProbeSetFreeze, Strain
                           where ProbeSetData.Id = ProbeSetXRef.DataId
                           and ProbeSetData.StrainId = Strain.Id
                           and ProbeSetXRef.ProbeSetId = %s
                           and ProbeSetFreeze.Id = ProbeSetXRef.ProbeSetFreezeId
                           and ProbeSetFreeze.Name = %s;
                    """, (probeset[0], dataset_name))
        val_dic = collections.OrderedDict()
        vals = cursor.fetchall()
        for val in vals:
            # val is (Strain.Name, ProbeSetData.value)
            val_dic[val[0]] = val[1]

        probeset_vals[probeset[1]] = val_dic
        show_progress("Querying DB", counter)

    return probeset_vals
+
def trim_strains(strains, probeset_vals):
    """Return the subset of `strains` that have a value in the first probeset.

    Only the first probeset's strain dict is consulted; this assumes every
    probeset in the dataset was measured on the same set of strains — TODO
    confirm that holds for the exported dataset.

    Fix: use .values() instead of the Python-2-only .itervalues(), which
    would raise AttributeError under Python 3; behavior is identical.
    """
    trimmed_strains = []
    #print("probeset_vals is:", pf(probeset_vals))
    first_probeset = list(probeset_vals.values())[0]
    print("\n**** first_probeset is:", pf(first_probeset))
    for strain in strains:
        print("\n**** strain is:", pf(strain))
        if strain in first_probeset:
            trimmed_strains.append(strain)
    print("trimmed_strains:", pf(trimmed_strains))
    return trimmed_strains
+
def write_data_matrix_file(strains, probeset_vals, filename):
    """Write probeset values to `filename` as a fully quoted CSV matrix:
    a header row of strain names, then one row per probeset.

    NOTE(review): "wb" is the Python 2 convention for csv output; under
    Python 3 csv.writer needs a text-mode handle opened with newline="".
    Assumes every strain in `strains` has a value for every probeset —
    a missing strain raises KeyError; confirm trim_strains guarantees this
    (it only checks the first probeset).
    """
    with open(filename, "wb") as fh:
        csv_writer = csv.writer(fh, delimiter=",", quoting=csv.QUOTE_ALL)
        #print("strains is:", pf(strains))
        csv_writer.writerow(['ID'] + strains)
        for counter, probeset in enumerate(probeset_vals):
            row_data = [probeset]
            for strain in strains:
                #print("probeset is: ", pf(probeset_vals[probeset]))
                row_data.append(probeset_vals[probeset][strain])
            #print("row_data is: ", pf(row_data))
            csv_writer.writerow(row_data)
            show_progress("Writing", counter)
+
def main():
    """Export one hard-coded ProbeSetFreeze dataset to a CSV matrix file.

    Edit `filename` and `dataset_name` below to export a different dataset;
    this is a one-off maintenance script, not a library entry point.
    """
    filename = os.path.expanduser("~/gene/wqflask/maintenance/" +
            "ProbeSetFreezeId_210_FullName_Eye_AXBXA_Illumina_V6.2" +
            "(Oct08)_RankInv_Beta.txt")
    dataset_name = "Eye_AXBXA_1008_RankInv"

    cursor = get_cursor()
    strains = get_strains(cursor)
    print("Getting probset_vals")
    probeset_vals = get_probeset_vals(cursor, dataset_name)
    print("Finished getting probeset_vals")
    trimmed_strains = trim_strains(strains, probeset_vals)
    write_data_matrix_file(trimmed_strains, probeset_vals, filename)

if __name__ == '__main__':
    main()
diff --git a/wqflask/maintenance/quick_search_table.py b/wqflask/maintenance/quick_search_table.py
new file mode 100644
index 00000000..046a05c4
--- /dev/null
+++ b/wqflask/maintenance/quick_search_table.py
@@ -0,0 +1,484 @@
+from __future__ import print_function, division, absolute_import
+
+"""
+Results will be returned for each of several trait types: mRNA assays, phenotypes, genotypes, and
+(maybe later) genes
+
+For each trait type, the results for each species should be given; for example, have a "Mouse" tab
+with the mouse traits in a table inside it
+
+This table will then list each trait, its dataset, and several columns determined by its trait type
+(phenotype, genotype, etc)
+
+"""
+
+
+import sys
+sys.path.append("../../..")
+
+import simplejson as json
+
+import sqlalchemy as sa
+from sqlalchemy.dialects import mysql
+from sqlalchemy.orm import scoped_session, sessionmaker, relationship, backref
+from sqlalchemy.orm.exc import NoResultFound
+from sqlalchemy.ext.declarative import declarative_base
+
+from pprint import pformat as pf
+
+import zach_settings as settings
+
# Module-level SQLAlchemy setup: a single engine for the whole script, a
# thread-local session factory, and a declarative base plus metadata bound
# to that engine.  These run at import time.
Engine = sa.create_engine(settings.SQLALCHEMY_DATABASE_URI,
                          #encoding='utf-8',
                          #client_encoding='utf-8',
                          #echo="debug",
                          )

Session = scoped_session(sessionmaker(bind=Engine)) #, extension=VersionedListener()))

Base = declarative_base(bind=Engine)
Metadata = sa.MetaData()
Metadata.bind = Engine
+
class PublishXRef(Base):
    """ORM mapping for PublishXRef (phenotype traits); feeds one QuickSearch
    row per (Id, InbredSetId) pair."""

    __tablename__ = 'PublishXRef'

    Id = sa.Column(sa.Integer, primary_key=True)
    InbredSetId = sa.Column(sa.Integer, primary_key=True)
    PhenotypeId = sa.Column(sa.Integer)
    PublicationId = sa.Column(sa.Integer)
    DataId = sa.Column(sa.Integer)
    Locus = sa.Column(sa.Text)
    LRS = sa.Column(sa.Float)
    additive = sa.Column(sa.Float)
    Sequence = sa.Column(sa.Integer)
    comments = sa.Column(sa.Text)

    @classmethod
    def run(cls):
        """Walk every PublishXRef record and insert its search terms and
        display fields into the QuickSearch table."""
        conn = Engine.connect()
        counter = 0
        for ps in page_query(Session.query(cls)):  #all()
            values = {}
            values['table_name'] = cls.__tablename__
            # Composite key serialized as JSON so QuickSearch can point back here.
            values['the_key'] = json.dumps([ps.Id, ps.InbredSetId])
            values['terms'] = cls.get_unique_terms(ps.Id, ps.InbredSetId)
            print("terms is:", values['terms'])
            values['result_fields'] = cls.get_result_fields(ps.Id, ps.InbredSetId)
            ins = QuickSearch.insert().values(**values)
            conn.execute(ins)
            counter += 1
            print("Done:", counter)

    @staticmethod
    def get_unique_terms(publishxref_id, inbredset_id):
        """Return a space-joined set of searchable tokens drawn from the
        phenotype descriptions/abbreviations and the publication title.

        Tokens shorter than 3 characters are dropped, surrounding
        brackets/parens and trailing semicolons are stripped, and tokens
        that cannot be encoded as UTF-8 are skipped.
        """
        results = Session.query(
            "pre_publication_description",
            "post_publication_description",
            "pre_publication_abbreviation",
            "post_publication_abbreviation",
            "publication_title"
        ).from_statement(
            "SELECT Phenotype.Pre_publication_description as pre_publication_description, "
            "Phenotype.Post_publication_description as post_publication_description, "
            "Phenotype.Pre_publication_abbreviation as pre_publication_abbreviation, "
            "Phenotype.Post_publication_abbreviation as post_publication_abbreviation, "
            "Publication.Title as publication_title "
            "FROM Phenotype, Publication, PublishXRef "
            "WHERE PublishXRef.Id = :publishxref_id and "
            "PublishXRef.InbredSetId = :inbredset_id and "
            "PublishXRef.PhenotypeId = Phenotype.Id and "
            "PublishXRef.PublicationId = Publication.Id ").params(publishxref_id=publishxref_id,
                                                                  inbredset_id=inbredset_id).all()

        unique = set()
        for item in results[0]:
            if not item:
                continue
            for token in item.split():
                if token.startswith(('(', '[')):
                    token = token[1:]
                if token.endswith((')', ']')):
                    token = token[:-1]
                if token.endswith(';'):
                    token = token[:-1]
                if len(token) > 2:
                    try:
                        # This hopefully ensures that the token is utf-8
                        token = token.encode('utf-8')
                        print(" ->", token)
                    except UnicodeDecodeError:
                        print("\n-- UDE \n")
                        # Can't get it into utf-8, we won't use it
                        continue

                    unique.add(token)
        return " ".join(unique)

    @staticmethod
    def get_result_fields(publishxref_id, inbredset_id):
        """Return the display fields for one phenotype trait as a JSON string.

        Bug fix: the original select list read "Publication.PubMed_ID"
        with no alias and no trailing comma, so Python string-literal
        concatenation fused it with the next fragment into
        "Publication.PubMed_IDPublication.Year as year" — invalid SQL that
        also left the declared "pubmed_id" result column with nothing to
        bind to.  Restored as "Publication.PubMed_ID as pubmed_id, ".
        """
        results = Session.query(
            "phenotype_id",
            "species",
            "group_name",
            "description",
            "lrs",
            "publication_id",
            "pubmed_id",
            "year",
            "authors"
        ).from_statement(
            "SELECT PublishXRef.PhenotypeId as phenotype_id, "
            "Species.Name as species, "
            "InbredSet.Name as group_name, "
            "Phenotype.Original_description as description, "
            "PublishXRef.LRS as lrs, "
            "PublishXRef.PublicationId as publication_id, "
            "Publication.PubMed_ID as pubmed_id, "
            "Publication.Year as year, "
            "Publication.Authors as authors "
            "FROM PublishXRef, "
            "Phenotype, "
            "Publication, "
            "InbredSet, "
            "Species "
            "WHERE PublishXRef.Id = :publishxref_id and "
            "PublishXRef.InbredSetId = :inbredset_id and "
            "PublishXRef.PhenotypeId = Phenotype.Id and "
            "PublishXRef.PublicationId = Publication.Id and "
            "InbredSet.Id = :inbredset_id and "
            "Species.Id = InbredSet.SpeciesId ").params(publishxref_id=publishxref_id,
                                                        inbredset_id=inbredset_id).all()

        # Every row of the join must agree, or something is wrong upstream.
        assert len(set(result for result in results)) == 1, "Different results or no results"

        result = results[0]
        result = row2dict(result)
        try:
            json_results = json.dumps(result, sort_keys=True)
        except UnicodeDecodeError:
            print("\n\nTrying to massage unicode\n\n")
            # Strip undecodable bytes field by field, then retry.
            for key, value in result.iteritems():
                print("\tkey is:", key)
                print("\tvalue is:", value)
                if isinstance(value, basestring):
                    result[key] = value.decode('utf-8', errors='ignore')
            json_results = json.dumps(result, sort_keys=True)

        return json_results
+
class GenoXRef(Base):
    """ORM mapping for GenoXRef (genotype markers); feeds one QuickSearch
    row per (GenoId, GenoFreezeId) pair."""

    __tablename__ = 'GenoXRef'

    GenoFreezeId = sa.Column(sa.Integer, primary_key=True)
    GenoId = sa.Column(sa.Integer, primary_key=True)
    DataId = sa.Column(sa.Integer)
    cM = sa.Column(sa.Float)
    Used_for_mapping = sa.Column(sa.Text)

    @classmethod
    def run(cls):
        """Walk every GenoXRef record and insert its search terms and
        display fields into the QuickSearch table.

        NOTE(review): when get_unique_terms returns an empty string,
        'result_fields' is never set but the row is still inserted (with
        result_fields NULL) — unlike ProbeSetXRef.run, which skips such
        rows.  Confirm whether that asymmetry is intended.
        """
        conn = Engine.connect()
        counter = 0
        for item in page_query(Session.query(cls)):  #all()
            values = {}
            values['table_name'] = cls.__tablename__
            values['the_key'] = json.dumps([item.GenoId, item.GenoFreezeId])
            values['terms'] = cls.get_unique_terms(item.GenoId)
            print("terms is:", values['terms'])
            if values['terms']:
                values['result_fields'] = cls.get_result_fields(item.GenoId, item.GenoFreezeId)
            ins = QuickSearch.insert().values(**values)
            conn.execute(ins)
            counter += 1
            print("Done:", counter)

    @staticmethod
    def get_unique_terms(geno_id):
        """Return a space-joined set of searchable tokens built from the
        marker's Name and Marker_Name; tokens shorter than 3 characters or
        not encodable as UTF-8 are dropped."""
        print("geno_id: ", geno_id)
        results = Session.query(
            "name",
            "marker_name"
        ).from_statement(
            "SELECT Geno.Name as name, "
            "Geno.Marker_Name as marker_name "
            "FROM Geno "
            "WHERE Geno.Id = :geno_id ").params(geno_id=geno_id).all()

        unique = set()
        if len(results):
            for item in results[0]:
                if not item:
                    continue
                for token in item.split():
                    if len(token) > 2:
                        try:
                            # This hopefully ensures that the token is utf-8
                            token = token.encode('utf-8')
                            print(" ->", token)
                        except UnicodeDecodeError:
                            print("\n-- UDE \n")
                            # Can't get it into utf-8, we won't use it
                            continue

                        unique.add(token)
        return " ".join(unique)


    @staticmethod
    def get_result_fields(geno_id, dataset_id):
        """Return the display fields for one genotype marker as a JSON string.

        All join rows must agree (asserted); undecodable text fields are
        stripped to UTF-8 before the JSON dump if necessary.
        """
        results = Session.query(
            "name",
            "marker_name",
            "group_name",
            "species",
            "dataset",
            "dataset_name",
            "chr", "mb",
            "source"
        ).from_statement(
            "SELECT Geno.Name as name, "
            "Geno.Marker_Name as marker_name, "
            "InbredSet.Name as group_name, "
            "Species.Name as species, "
            "GenoFreeze.Name as dataset, "
            "GenoFreeze.FullName as dataset_name, "
            "Geno.Chr as chr, "
            "Geno.Mb as mb, "
            "Geno.Source as source "
            "FROM Geno, "
            "GenoXRef, "
            "GenoFreeze, "
            "InbredSet, "
            "Species "
            "WHERE Geno.Id = :geno_id and "
            "GenoXRef.GenoId = Geno.Id and "
            "GenoFreeze.Id = :dataset_id and "
            "GenoXRef.GenoFreezeId = GenoFreeze.Id and "
            "InbredSet.Id = GenoFreeze.InbredSetId and "
            "InbredSet.SpeciesId = Species.Id ").params(geno_id=geno_id,
                                                        dataset_id=dataset_id).all()
        assert len(set(result for result in results)) == 1, "Different results"

        result = results[0]
        result = row2dict(result)
        try:
            json_results = json.dumps(result, sort_keys=True)
        except UnicodeDecodeError:
            print("\n\nTrying to massage unicode\n\n")
            for key, value in result.iteritems():
                print("\tkey is:", key)
                print("\tvalue is:", value)
                if isinstance(value, basestring):
                    result[key] = value.decode('utf-8', errors='ignore')
            json_results = json.dumps(result, sort_keys=True)

        return json_results
+
class ProbeSetXRef(Base):
    """ORM mapping for ProbeSetXRef (mRNA assay traits); feeds one
    QuickSearch row per (ProbeSetId, ProbeSetFreezeId) pair."""

    __tablename__ = 'ProbeSetXRef'

    ProbeSetFreezeId = sa.Column(sa.Integer, primary_key=True)
    ProbeSetId = sa.Column(sa.Integer, primary_key=True)
    DataId = sa.Column(sa.Integer, unique=True)
    Locus_old = sa.Column(sa.Text)
    LRS_old = sa.Column(sa.Float)
    pValue_old = sa.Column(sa.Float)
    mean = sa.Column(sa.Float)
    se = sa.Column(sa.Float)
    Locus = sa.Column(sa.Text)
    LRS = sa.Column(sa.Float)
    pValue = sa.Column(sa.Float)
    additive = sa.Column(sa.Float)
    h2 = sa.Column(sa.Float)

    @classmethod
    def run(cls):
        """Walk every ProbeSetXRef record and insert its search terms and
        display fields into the QuickSearch table; records whose display
        fields are ambiguous (get_result_fields returns None) are skipped.
        """
        conn = Engine.connect()
        counter = 0
        for ps in page_query(Session.query(cls)):  #all()
            values = {}
            values['table_name'] = cls.__tablename__
            values['the_key'] = json.dumps([ps.ProbeSetId, ps.ProbeSetFreezeId])
            values['terms'] = cls.get_unique_terms(ps.ProbeSetId)
            print("terms is:", values['terms'])
            values['result_fields'] = cls.get_result_fields(ps.ProbeSetId, ps.ProbeSetFreezeId)
            # Idiom fix: identity comparison with None ("is None"), not "== None".
            if values['result_fields'] is None:
                continue
            ins = QuickSearch.insert().values(**values)
            conn.execute(ins)
            counter += 1
            print("Done:", counter)

    @staticmethod
    def get_unique_terms(probeset_id):
        """Return a space-joined set of searchable tokens built from the
        probeset's name, symbol, description and alias.

        Tokens shorter than 3 characters are dropped, surrounding
        brackets/parens and trailing semicolons are stripped, and tokens
        that cannot be encoded as UTF-8 are skipped.
        """
        results = Session.query(
            "name",
            "symbol",
            "description",
            "alias"
        ).from_statement(
            "SELECT ProbeSet.Name as name, "
            "ProbeSet.Symbol as symbol, "
            "ProbeSet.description as description, "
            "ProbeSet.alias as alias "
            "FROM ProbeSet "
            "WHERE ProbeSet.Id = :probeset_id ").params(probeset_id=probeset_id).all()

        unique = set()
        if len(results):
            for item in results[0]:
                if not item:
                    continue
                for token in item.split():
                    if token.startswith(('(', '[')):
                        token = token[1:]
                    if token.endswith((')', ']')):
                        token = token[:-1]
                    if token.endswith(';'):
                        token = token[:-1]
                    if len(token) > 2:
                        try:
                            # This hopefully ensures that the token is utf-8
                            token = token.encode('utf-8')
                            print(" ->", token)
                        except UnicodeDecodeError:
                            print("\n-- UDE \n")
                            # Can't get it into utf-8, we won't use it
                            continue

                        unique.add(token)
        return " ".join(unique)


    @staticmethod
    def get_result_fields(probeset_id, dataset_id):
        """Return the display fields for one mRNA assay trait as a JSON
        string, or None when the join yields zero or conflicting rows
        (unlike the other classes, which assert instead)."""
        print("probeset_id: ", probeset_id)
        print("dataset_id: ", dataset_id)
        results = Session.query(
            "name",
            "species",
            "group_name",
            "dataset",
            "dataset_name",
            "symbol",
            "description",
            "chr", "mb",
            "lrs", "mean",
            "genbank_id",
            "gene_id",
            "chip_id",
            "chip_name"
        ).from_statement(
            "SELECT ProbeSet.Name as name, "
            "Species.Name as species, "
            "InbredSet.Name as group_name, "
            "ProbeSetFreeze.Name as dataset, "
            "ProbeSetFreeze.FullName as dataset_name, "
            "ProbeSet.Symbol as symbol, "
            "ProbeSet.description as description, "
            "ProbeSet.Chr as chr, "
            "ProbeSet.Mb as mb, "
            "ProbeSetXRef.LRS as lrs, "
            "ProbeSetXRef.mean as mean, "
            "ProbeSet.GenbankId as genbank_id, "
            "ProbeSet.GeneId as gene_id, "
            "ProbeSet.ChipId as chip_id, "
            "GeneChip.Name as chip_name "
            "FROM ProbeSet, "
            "ProbeSetXRef, "
            "ProbeSetFreeze, "
            "ProbeFreeze, "
            "InbredSet, "
            "Species, "
            "GeneChip "
            "WHERE ProbeSetXRef.ProbeSetId = :probeset_id and "
            "ProbeSetXRef.ProbeSetFreezeId = :dataset_id and "
            "ProbeSetXRef.ProbeSetId = ProbeSet.Id and "
            "ProbeSet.ChipId = GeneChip.Id and "
            "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id and "
            "ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id and "
            "ProbeFreeze.InbredSetId = InbredSet.Id and "
            "InbredSet.SpeciesId = Species.Id ").params(probeset_id=probeset_id,
                                                        dataset_id=dataset_id).all()

        if len(set(result for result in results)) != 1:
            return None

        result = results[0]
        result = row2dict(result)
        try:
            json_results = json.dumps(result, sort_keys=True)
        except UnicodeDecodeError:
            print("\n\nTrying to massage unicode\n\n")
            for key, value in result.iteritems():
                print("\tkey is:", key)
                print("\tvalue is:", value)
                if isinstance(value, basestring):
                    result[key] = value.decode('utf-8', errors='ignore')
            json_results = json.dumps(result, sort_keys=True)

        return json_results
+
+
# QuickSearch: one denormalized row per searchable trait, keyed by the
# source table name plus a JSON-serialized composite key into that table.
QuickSearch = sa.Table("QuickSearch", Metadata,
        sa.Column('table_name', sa.String(15),
                  primary_key=True, nullable=False, autoincrement=False), # table that item is inserted from
        sa.Column('the_key', sa.String(30),
                  primary_key=True, nullable=False, autoincrement=False), # key in database table
        sa.Column('terms', sa.Text), # terms to compare search string with
        sa.Column('result_fields', sa.Text) # json
        )

# NOTE(review): executed at import time — merely importing this module
# drops and recreates the QuickSearch table.
QuickSearch.drop(Engine, checkfirst=True)
Metadata.create_all(Engine)
+
+
def row2dict(row):
    """Convert a result row (anything with .keys() and value iteration)
    into a plain dict.

    Adapted from http://stackoverflow.com/a/2848519/1175849
    """
    return {key: value for key, value in zip(row.keys(), row)}
+
+
def page_query(q):
    """Yield every element of query `q` in windows of 1000 rows, stopping
    at the first empty window.

    Adapted from http://stackoverflow.com/a/1217947/1175849
    """
    offset = 0
    while True:
        page = list(q.limit(1000).offset(offset))
        if not page:
            break
        for elem in page:
            yield elem
        offset += 1000
+
+
def main():
    """Rebuild the QuickSearch table from all three trait sources.

    The table itself is dropped and recreated at module import time;
    each .run() below inserts its own rows independently.
    """
    ProbeSetXRef.run()
    GenoXRef.run()
    PublishXRef.run()

if __name__ == "__main__":
    main()