aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Zachary Sloan, 2013-07-23 17:12:29 -0500
committer: Zachary Sloan, 2013-07-23 17:12:29 -0500
commit: 930d8b0cf0c7bf88fee1de95852dfe883418a494 (patch)
tree: 04d542a01dba0c4529f55912ab242a3bc61459aa
parent: 82f493650909e2351035e26e9dc82b16498beb48 (diff)
download: genenetwork2-930d8b0cf0c7bf88fee1de95852dfe883418a494.tar.gz
Fixed a couple of issues with the quick_search_table.py script
Added notes on using Percona to optimise MySQL to gn_installation_notes.txt. Started moving "get_trait_data" to DataSet, since it is the same regardless of the particular DataSet subclass (PhenotypeDataSet, MrnaAssayDataSet, etc.).
-rw-r--r-- misc/gn_installation_notes.txt | 8
-rw-r--r-- misc/notes.txt | 15
-rwxr-xr-x wqflask/base/data_set.py | 202
-rw-r--r-- wqflask/maintenance/quick_search_table.py | 62
-rw-r--r-- wqflask/wqflask/my_pylmm/pyLMM/lmm.py | 3
5 files changed, 194 insertions, 96 deletions
diff --git a/misc/gn_installation_notes.txt b/misc/gn_installation_notes.txt
index fb38844f..91fe1781 100644
--- a/misc/gn_installation_notes.txt
+++ b/misc/gn_installation_notes.txt
@@ -172,6 +172,14 @@ Setup accounts in MySQL (first need to delete anonymous/non-root accounts):
#; GRANT ALL ON *.* TO 'yourusername'@'%' IDENTIFIED BY 'yourpassword';
#; select * from user;
+Use Percona to optimise my.cfg file:
+https://tools.percona.com/wizard
+
+To check # CPUs:
+grep -c processor /proc/cpuinfo
+
+To check memory, can use htop
+
============================================
Check RSA key:
diff --git a/misc/notes.txt b/misc/notes.txt
index d9bd6e41..9e61e527 100644
--- a/misc/notes.txt
+++ b/misc/notes.txt
@@ -269,6 +269,21 @@ grep -ir (search string) (directory)
===========================================
+Change owner/group:
+
+chown zas1024 somefile (change owner of somefile to zas1024)
+chown zas1024: somefile (change owner and group of somefile to zas1024)
+sudo chown root: somefile (change owner/group of somefile to root)
+
+Change permissions:
+
+chmod g-wr (remove write/read privileges for group)
+chmod g+w (add write privileges for group)
+chmod u+rwx (add read, write, and execute privileges for user)
+etc.
+
+===========================================
+
MySQL Things -
Create fulltext (MATCH AGAINST syntax) index:
diff --git a/wqflask/base/data_set.py b/wqflask/base/data_set.py
index d5aae31d..188a94a2 100755
--- a/wqflask/base/data_set.py
+++ b/wqflask/base/data_set.py
@@ -435,6 +435,74 @@ class DataSet(object):
except TypeError:
print("Dataset {} is not yet available in GeneNetwork.".format(self.name))
pass
+
+ def get_trait_data(self):
+ self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+ query = """
+ SELECT Strain.Name, Strain.Id FROM Strain, Species
+ WHERE Strain.Name IN {}
+ and Strain.SpeciesId=Species.Id
+ and Species.name = '{}'
+ """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+ results = dict(g.db.execute(query).fetchall())
+ sample_ids = [results[item] for item in self.samplelist]
+
+ # MySQL limits the number of tables that can be used in a join to 61,
+ # so we break the sample ids into smaller chunks
+ # Postgres doesn't have that limit, so we can get rid of this after we transition
+ chunk_size = 50
+ number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+ trait_sample_data = []
+ for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+
+ #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+ #tempTable = None
+ #if GeneId and db.type == "ProbeSet":
+ # if method == "3":
+ # tempTable = self.getTempLiteratureTable(species=species,
+ # input_species_geneid=GeneId,
+ # returnNumber=returnNumber)
+ #
+ # if method == "4" or method == "5":
+ # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+ # TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+ # method=method,
+ # returnNumber=returnNumber)
+
+ temp = ['T%s.value' % item for item in sample_ids_step]
+ query = "SELECT {}.Name,".format(escape(self.type))
+ data_start_pos = 1
+ query += string.join(temp, ', ')
+ query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+ self.type,
+ self.type))
+
+ for item in sample_ids_step:
+ query += """
+ left join {}Data as T{} on T{}.Id = {}XRef.DataId
+ and T{}.StrainId={}\n
+ """.format(*mescape(self.type, item, item, self.type, item, item))
+
+ query += """
+ WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(self.type, self.type, self.type, self.type,
+ self.name, self.type, self.type, self.type, self.type))
+ results = g.db.execute(query).fetchall()
+ trait_sample_data.append(results)
+
+ trait_count = len(trait_sample_data[0])
+ self.trait_data = collections.defaultdict(list)
+
+ # put all of the separate data together into a dictionary where the keys are
+ # trait names and values are lists of sample values
+ for trait_counter in range(trait_count):
+ trait_name = trait_sample_data[0][trait_counter][0]
+ for chunk_counter in range(int(number_chunks)):
+ self.trait_data[trait_name] += (
+ trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
class PhenotypeDataSet(DataSet):
DS_NAME_MAP['Publish'] = 'PhenotypeDataSet'
@@ -768,73 +836,73 @@ class MrnaAssayDataSet(DataSet):
#print("After retrieve_sample_data")
return trait_data
- def get_trait_data(self):
- self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
- query = """
- SELECT Strain.Name, Strain.Id FROM Strain, Species
- WHERE Strain.Name IN {}
- and Strain.SpeciesId=Species.Id
- and Species.name = '{}'
- """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
- results = dict(g.db.execute(query).fetchall())
- sample_ids = [results[item] for item in self.samplelist]
-
- # MySQL limits the number of tables that can be used in a join to 61,
- # so we break the sample ids into smaller chunks
- # Postgres doesn't have that limit, so we can get rid of this after we transition
- chunk_size = 50
- number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
- trait_sample_data = []
- for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
-
- #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
- #tempTable = None
- #if GeneId and db.type == "ProbeSet":
- # if method == "3":
- # tempTable = self.getTempLiteratureTable(species=species,
- # input_species_geneid=GeneId,
- # returnNumber=returnNumber)
- #
- # if method == "4" or method == "5":
- # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
- # TissueProbeSetFreezeId=tissueProbeSetFreezeId,
- # method=method,
- # returnNumber=returnNumber)
-
- temp = ['T%s.value' % item for item in sample_ids_step]
- query = "SELECT {}.Name,".format(escape(self.type))
- data_start_pos = 1
- query += string.join(temp, ', ')
- query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
- self.type,
- self.type))
-
- for item in sample_ids_step:
- query += """
- left join {}Data as T{} on T{}.Id = {}XRef.DataId
- and T{}.StrainId={}\n
- """.format(*mescape(self.type, item, item, self.type, item, item))
-
- query += """
- WHERE {}XRef.{}FreezeId = {}Freeze.Id
- and {}Freeze.Name = '{}'
- and {}.Id = {}XRef.{}Id
- order by {}.Id
- """.format(*mescape(self.type, self.type, self.type, self.type,
- self.name, self.type, self.type, self.type, self.type))
- results = g.db.execute(query).fetchall()
- trait_sample_data.append(results)
-
- trait_count = len(trait_sample_data[0])
- self.trait_data = collections.defaultdict(list)
-
- # put all of the separate data together into a dictionary where the keys are
- # trait names and values are lists of sample values
- for trait_counter in range(trait_count):
- trait_name = trait_sample_data[0][trait_counter][0]
- for chunk_counter in range(int(number_chunks)):
- self.trait_data[trait_name] += (
- trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
+ #def get_trait_data(self):
+ # self.samplelist = self.group.samplelist + self.group.parlist + self.group.f1list
+ # query = """
+ # SELECT Strain.Name, Strain.Id FROM Strain, Species
+ # WHERE Strain.Name IN {}
+ # and Strain.SpeciesId=Species.Id
+ # and Species.name = '{}'
+ # """.format(create_in_clause(self.samplelist), *mescape(self.group.species))
+ # results = dict(g.db.execute(query).fetchall())
+ # sample_ids = [results[item] for item in self.samplelist]
+ #
+ # # MySQL limits the number of tables that can be used in a join to 61,
+ # # so we break the sample ids into smaller chunks
+ # # Postgres doesn't have that limit, so we can get rid of this after we transition
+ # chunk_size = 50
+ # number_chunks = int(math.ceil(len(sample_ids) / chunk_size))
+ # trait_sample_data = []
+ # for sample_ids_step in chunks.divide_into_chunks(sample_ids, number_chunks):
+ #
+ # #XZ, 09/24/2008: build one temporary table that only contains the records associated with the input GeneId
+ # #tempTable = None
+ # #if GeneId and db.type == "ProbeSet":
+ # # if method == "3":
+ # # tempTable = self.getTempLiteratureTable(species=species,
+ # # input_species_geneid=GeneId,
+ # # returnNumber=returnNumber)
+ # #
+ # # if method == "4" or method == "5":
+ # # tempTable = self.getTempTissueCorrTable(primaryTraitSymbol=GeneSymbol,
+ # # TissueProbeSetFreezeId=tissueProbeSetFreezeId,
+ # # method=method,
+ # # returnNumber=returnNumber)
+ #
+ # temp = ['T%s.value' % item for item in sample_ids_step]
+ # query = "SELECT {}.Name,".format(escape(self.type))
+ # data_start_pos = 1
+ # query += string.join(temp, ', ')
+ # query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(self.type,
+ # self.type,
+ # self.type))
+ #
+ # for item in sample_ids_step:
+ # query += """
+ # left join {}Data as T{} on T{}.Id = {}XRef.DataId
+ # and T{}.StrainId={}\n
+ # """.format(*mescape(self.type, item, item, self.type, item, item))
+ #
+ # query += """
+ # WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ # and {}Freeze.Name = '{}'
+ # and {}.Id = {}XRef.{}Id
+ # order by {}.Id
+ # """.format(*mescape(self.type, self.type, self.type, self.type,
+ # self.name, self.type, self.type, self.type, self.type))
+ # results = g.db.execute(query).fetchall()
+ # trait_sample_data.append(results)
+ #
+ # trait_count = len(trait_sample_data[0])
+ # self.trait_data = collections.defaultdict(list)
+ #
+ # # put all of the separate data together into a dictionary where the keys are
+ # # trait names and values are lists of sample values
+ # for trait_counter in range(trait_count):
+ # trait_name = trait_sample_data[0][trait_counter][0]
+ # for chunk_counter in range(int(number_chunks)):
+ # self.trait_data[trait_name] += (
+ # trait_sample_data[chunk_counter][trait_counter][data_start_pos:])
def get_trait_info(self, trait_list=None, species=''):
diff --git a/wqflask/maintenance/quick_search_table.py b/wqflask/maintenance/quick_search_table.py
index b07e7656..9cd792ef 100644
--- a/wqflask/maintenance/quick_search_table.py
+++ b/wqflask/maintenance/quick_search_table.py
@@ -71,9 +71,10 @@ class PublishXRef(Base):
values['the_key'] = json.dumps([pub_row.Id, pub_row.InbredSetId])
values['terms'] = cls.get_unique_terms(pub_row.Id, pub_row.InbredSetId)
print("terms is:", values['terms'])
- values['result_fields'] = cls.get_result_fields(pub_row.Id, pub_row.InbredSetId)
- ins = QuickSearch.insert().values(**values)
- conn.execute(ins)
+ if values['terms']:
+ values['result_fields'] = cls.get_result_fields(pub_row.Id, pub_row.InbredSetId)
+ ins = QuickSearch.insert().values(**values)
+ conn.execute(ins)
counter += 1
print("Done:", counter)
@@ -100,28 +101,30 @@ class PublishXRef(Base):
inbredset_id=inbredset_id).all()
unique = set()
- for item in results[0]:
- #print("locals:", locals())
- if not item:
- continue
- for token in item.split():
- if token.startswith(('(','[')):
- token = token[1:]
- if token.endswith((')', ']')):
- token = token[:-1]
- if token.endswith(';'):
- token = token[:-1]
- if len(token) > 2:
- try:
- # This hopefully ensures that the token is utf-8
- token = token.encode('utf-8')
- print(" ->", token)
- except UnicodeDecodeError:
- print("\n-- UDE \n")
- # Can't get it into utf-8, we won't use it
- continue
-
- unique.add(token)
+ print("results: ", results)
+ if len(results):
+ for item in results[0]:
+ #print("locals:", locals())
+ if not item:
+ continue
+ for token in item.split():
+ if token.startswith(('(','[')):
+ token = token[1:]
+ if token.endswith((')', ']')):
+ token = token[:-1]
+ if token.endswith(';'):
+ token = token[:-1]
+ if len(token) > 2:
+ try:
+ # This hopefully ensures that the token is utf-8
+ token = token.encode('utf-8')
+ print(" ->", token)
+ except UnicodeDecodeError:
+ print("\n-- UDE \n")
+ # Can't get it into utf-8, we won't use it
+ continue
+
+ unique.add(token)
#print("\nUnique terms are: {}\n".format(unique))
return " ".join(unique)
@@ -467,8 +470,8 @@ QuickSearch = sa.Table("QuickSearch", Metadata,
mysql_engine = 'MyISAM',
)
-#QuickSearch.drop(Engine, checkfirst=True)
-#Metadata.create_all(Engine)
+QuickSearch.drop(Engine, checkfirst=True)
+Metadata.create_all(Engine)
def row2dict(row):
@@ -495,9 +498,10 @@ def main():
Add all items from the ProbeSetXRef, GenoXRef, and PublishXRef tables to the QuickSearch tables.
"""
+
+ GenoXRef.run()
+ PublishXRef.run()
ProbeSetXRef.run()
- #GenoXRef.run()
- #PublishXRef.run()
if __name__ == "__main__":
main() \ No newline at end of file
diff --git a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
index 38a49f12..446683a4 100644
--- a/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
+++ b/wqflask/wqflask/my_pylmm/pyLMM/lmm.py
@@ -237,6 +237,9 @@ def run(pheno_vector,
with Bench("Calculate Kinship"):
kinship_matrix = calculate_kinship(genotype_matrix, temp_data)
+ print("kinship_matrix: ", pf(kinship_matrix))
+ print("kinship_matrix.shape: ", pf(kinship_matrix.shape))
+
with Bench("Create LMM object"):
lmm_ob = LMM(pheno_vector, kinship_matrix)