diff options
-rwxr-xr-x | scripts/maintenance/datastructure.py | 177 | ||||
-rwxr-xr-x | scripts/maintenance/load_phenotypes.py | 43 | ||||
-rw-r--r-- | scripts/maintenance/utilities.py | 89 | ||||
-rw-r--r-- | wqflask/wqflask/correlation/correlation_gn3_api.py | 127 | ||||
-rw-r--r-- | wqflask/wqflask/templates/test_correlation_page.html | 8 | ||||
-rw-r--r-- | wqflask/wqflask/templates/tutorials.html | 1 | ||||
-rw-r--r-- | wqflask/wqflask/views.py | 4 |
7 files changed, 386 insertions, 63 deletions
diff --git a/scripts/maintenance/datastructure.py b/scripts/maintenance/datastructure.py new file mode 100755 index 00000000..9f3e8b1e --- /dev/null +++ b/scripts/maintenance/datastructure.py @@ -0,0 +1,177 @@ +import utilities + +def get_probesetfreezes(inbredsetid): + cursor, con = utilities.get_cursor() + sql = """ + SELECT ProbeSetFreeze.`Id`, ProbeSetFreeze.`Name`, ProbeSetFreeze.`FullName` + FROM ProbeSetFreeze, ProbeFreeze + WHERE ProbeSetFreeze.`ProbeFreezeId`=ProbeFreeze.`Id` + AND ProbeFreeze.`InbredSetId`=%s + """ + cursor.execute(sql, (inbredsetid)) + return cursor.fetchall() + +def get_probesetfreeze(probesetfreezeid): + cursor, con = utilities.get_cursor() + sql = """ + SELECT ProbeSetFreeze.`Id`, ProbeSetFreeze.`Name`, ProbeSetFreeze.`FullName` + FROM ProbeSetFreeze + WHERE ProbeSetFreeze.`Id`=%s + """ + cursor.execute(sql, (probesetfreezeid)) + return cursor.fetchone() + +def get_strains(inbredsetid): + cursor, con = utilities.get_cursor() + sql = """ + SELECT Strain.`Id`, Strain.`Name` + FROM StrainXRef, Strain + WHERE StrainXRef.`InbredSetId`=%s + AND StrainXRef.`StrainId`=Strain.`Id` + ORDER BY StrainXRef.`OrderId` + """ + cursor.execute(sql, (inbredsetid)) + return cursor.fetchall() + +def get_inbredset(probesetfreezeid): + cursor, con = utilities.get_cursor() + sql = """ + SELECT InbredSet.`Id`, InbredSet.`Name`, InbredSet.`FullName` + FROM InbredSet, ProbeFreeze, ProbeSetFreeze + WHERE InbredSet.`Id`=ProbeFreeze.`InbredSetId` + AND ProbeFreeze.`Id`=ProbeSetFreeze.`ProbeFreezeId` + AND ProbeSetFreeze.`Id`=%s + """ + cursor.execute(sql, (probesetfreezeid)) + return cursor.fetchone() + +def get_species(inbredsetid): + cursor, con = utilities.get_cursor() + sql = """ + SELECT Species.`Id`, Species.`Name`, Species.`MenuName`, Species.`FullName` + FROM InbredSet, Species + WHERE InbredSet.`Id`=%s + AND InbredSet.`SpeciesId`=Species.`Id` + """ + cursor.execute(sql, (inbredsetid)) + return cursor.fetchone() + +def get_genofreeze_byinbredsetid(inbredsetid): + cursor, con = utilities.get_cursor() + sql = """ + SELECT GenoFreeze.`Id`, GenoFreeze.`Name`, GenoFreeze.`FullName`, GenoFreeze.`InbredSetId` + FROM GenoFreeze + WHERE GenoFreeze.`InbredSetId`=%s + """ + cursor.execute(sql, (inbredsetid)) + return cursor.fetchone() + +def get_nextdataid_genotype(): + cursor, con = utilities.get_cursor() + sql = """ + SELECT GenoData.`Id` + FROM GenoData + ORDER BY GenoData.`Id` DESC + LIMIT 1 + """ + cursor.execute(sql) + re = cursor.fetchone() + dataid = re[0] + dataid += 1 + return dataid + +def get_nextdataid_phenotype(): + cursor, con = utilities.get_cursor() + sql = """ + SELECT PublishData.`Id` + FROM PublishData + ORDER BY PublishData.`Id` DESC + LIMIT 1 + """ + cursor.execute(sql) + re = cursor.fetchone() + dataid = re[0] + dataid += 1 + return dataid + +def get_nextorderid_strainxref(inbredsetid): + cursor, con = utilities.get_cursor() + sql = """ + SELECT StrainXRef.`OrderId` + FROM StrainXRef + WHERE StrainXRef.`InbredSetId`=%s + ORDER BY StrainXRef.`OrderId` DESC + LIMIT 1 + """ + cursor.execute(sql, (inbredsetid)) + re = cursor.fetchone() + if re: + orderid = re[0] + 1 + else: + orderid = 1 + return orderid + +def insert_strain(inbredsetid, strainname): + speciesid = get_species(inbredsetid)[0] + cursor, con = utilities.get_cursor() + sql = """ + INSERT INTO Strain + SET + Strain.`Name`=%s, + Strain.`Name2`=%s, + Strain.`SpeciesId`=%s + """ + cursor.execute(sql, (strainname, strainname, speciesid)) + +def insert_strainxref(inbredsetid, strainid): + orderid = get_nextorderid_strainxref(inbredsetid) + cursor, con = utilities.get_cursor() + sql = """ + INSERT INTO StrainXRef + SET + StrainXRef.`InbredSetId`=%s, + StrainXRef.`StrainId`=%s, + StrainXRef.`OrderId`=%s, + StrainXRef.`Used_for_mapping`=%s, + StrainXRef.`PedigreeStatus`=%s + """ + cursor.execute(sql, (inbredsetid, strainid, orderid, "N", None)) + +def get_strain(inbredsetid, strainname): + speciesid = get_species(inbredsetid)[0] + cursor, con = utilities.get_cursor() + sql = """ + SELECT Strain.`Id`, Strain.`Name` + FROM Strain + WHERE Strain.`SpeciesId`=%s + AND Strain.`Name` LIKE %s + """ + cursor.execute(sql, (speciesid, strainname)) + return cursor.fetchone() + +def get_strainxref(inbredsetid, strainid): + cursor, con = utilities.get_cursor() + sql = """ + SELECT StrainXRef.`StrainId` + FROM StrainXRef + WHERE StrainXRef.`InbredSetId`=%s + AND StrainXRef.`StrainId`=%s + """ + cursor.execute(sql, (inbredsetid, strainid)) + return cursor.fetchone() + +def get_strain_sure(inbredsetid, strainname, updatestrainxref=None): + strain = get_strain(inbredsetid, strainname) + if not strain: + insert_strain(inbredsetid, strainname) + strain = get_strain(inbredsetid, strainname) + strainxref = get_strainxref(inbredsetid, strain[0]) + if not strainxref and updatestrainxref: + insert_strainxref(inbredsetid, strain[0]) + return strain + +def get_strains_bynames(inbredsetid, strainnames, updatestrainxref=None): + strains = [] + for strainname in strainnames: + strains.append(get_strain_sure(inbredsetid, strainname, updatestrainxref)) + return strains diff --git a/scripts/maintenance/load_phenotypes.py b/scripts/maintenance/load_phenotypes.py index 759d2eec..aa02d0cd 100755 --- a/scripts/maintenance/load_phenotypes.py +++ b/scripts/maintenance/load_phenotypes.py @@ -1,3 +1,11 @@ +# Load Python3 environment with GN2 utilities: +# +# source /usr/local/guix-profiles/gn-latest-20210512/etc/profile +# +# and run +# +# python load_phenotypes.py [args...] + import sys import csv @@ -9,35 +17,34 @@ def main(argv): config = utilities.get_config(argv[1]) print("config:") for item in config.items('config'): - print(("\t%s" % (str(item)))) + print("\t%s" % (str(item))) # var inbredsetid = config.get('config', 'inbredsetid') - print(("inbredsetid: %s" % inbredsetid)) + print("inbredsetid: %s" % inbredsetid) species = datastructure.get_species(inbredsetid) speciesid = species[0] - print(("speciesid: %s" % speciesid)) + print("speciesid: %s" % speciesid) dataid = datastructure.get_nextdataid_phenotype() - print(("next data id: %s" % dataid)) + print("next data id: %s" % dataid) cursor, con = utilities.get_cursor() # datafile datafile = open(config.get('config', 'datafile'), 'r') phenotypedata = csv.reader(datafile, delimiter='\t', quotechar='"') - phenotypedata_head = next(phenotypedata) - print(("phenotypedata head:\n\t%s" % phenotypedata_head)) + phenotypedata_head = phenotypedata.next() + print("phenotypedata head:\n\t%s" % phenotypedata_head) strainnames = phenotypedata_head[1:] strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes") # metafile metafile = open(config.get('config', 'metafile'), 'r') phenotypemeta = csv.reader(metafile, delimiter='\t', quotechar='"') - phenotypemeta_head = next(phenotypemeta) - print(("phenotypemeta head:\n\t%s" % phenotypemeta_head)) - print() + phenotypemeta_head = phenotypemeta.next() + print("phenotypemeta head:\n\t%s" % phenotypemeta_head) # load for metarow in phenotypemeta: # - datarow_value = next(phenotypedata) - datarow_se = next(phenotypedata) - datarow_n = next(phenotypedata) + datarow_value = phenotypedata.next() + datarow_se = phenotypedata.next() + datarow_n = phenotypedata.next() # Phenotype sql = """ INSERT INTO Phenotype @@ -67,7 +74,7 @@ def main(argv): )) rowcount = cursor.rowcount phenotypeid = con.insert_id() - print(("INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid))) + print("INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid)) # Publication publicationid = None # reset pubmed_id = utilities.to_db_string(metarow[0], None) @@ -81,7 +88,7 @@ def main(argv): re = cursor.fetchone() if re: publicationid = re[0] - print(("get Publication record: %d" % publicationid)) + print("get Publication record: %d" % publicationid) if not publicationid: sql = """ INSERT INTO Publication @@ -109,7 +116,7 @@ def main(argv): )) rowcount = cursor.rowcount publicationid = con.insert_id() - print(("INSERT INTO Publication: %d record: %d" % (rowcount, publicationid))) + print("INSERT INTO Publication: %d record: %d" % (rowcount, publicationid)) # data for index, strain in enumerate(strains): # @@ -158,14 +165,14 @@ def main(argv): cursor.execute(sql, (inbredsetid, phenotypeid, publicationid, dataid, "")) rowcount = cursor.rowcount publishxrefid = con.insert_id() - print(("INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid))) + print("INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid)) # for loop next dataid += 1 - print() + print # release con.close() if __name__ == "__main__": - print(("command line arguments:\n\t%s" % sys.argv)) + print("command line arguments:\n\t%s" % sys.argv) main(sys.argv) print("exit successfully") diff --git a/scripts/maintenance/utilities.py b/scripts/maintenance/utilities.py new file mode 100644 index 00000000..886410c2 --- /dev/null +++ b/scripts/maintenance/utilities.py @@ -0,0 +1,89 @@ +import MySQLdb +import re +import configparser + +def get_cursor(): + host = 'tux.uthsc.edu' + user = 'webqtlout' + passwd = 'webqtlout' + db = 'db_webqtl' + con = MySQLdb.Connect(db=db, host=host, user=user, passwd=passwd) + cursor = con.cursor() + return cursor, con + +def clearspaces(s, default=None): + if s: + s = re.sub('\s+', ' ', s) + s = s.strip() + return s + else: + return default + +def to_dic(keys, values): + dic = {} + for i in range(len(keys)): + key = keys[i] + value = values[i] + dic[key] = value + return dic + +def overlap(dic1, dic2): + keys = [] + values1 = [] + values2 = [] + for key in dic1.keys(): + if key in dic2: + value1 = dic1[key] + value2 = dic2[key] + if value1 and value2: + keys.append(key) + values1.append(value1) + values2.append(value2) + return keys, values1, values2 + +def to_db_string(s, default): + if s: + s = s.strip() + if len(s) == 0: + return default + elif s == 'x': + return default + else: + return s + else: + return default + +def to_db_float(s, default): + if s: + s = s.strip() + if len(s) == 0: + return default + elif s == 'x': + return default + else: + try: + return float(s) + except: + return default + else: + return default + +def to_db_int(s, default): + if s: + s = s.strip() + if len(s) == 0: + return default + elif s == 'x': + return default + else: + try: + return int(s) + except: + return default + else: + return default + +def get_config(configfile): + config = configparser.ConfigParser() + config.read(configfile) + return config diff --git a/wqflask/wqflask/correlation/correlation_gn3_api.py b/wqflask/wqflask/correlation/correlation_gn3_api.py index 98d52591..46202ca3 100644 --- a/wqflask/wqflask/correlation/correlation_gn3_api.py +++ b/wqflask/wqflask/correlation/correlation_gn3_api.py @@ -41,27 +41,33 @@ def process_samples(start_vars, sample_names, excluded_samples=None): return sample_data +def merge_correlation_results(correlation_results, target_correlation_results): + + corr_dict = {} + + for trait_dict in target_correlation_results: + for trait_name, values in trait_dict.items(): + + corr_dict[trait_name] = values + for trait_dict in correlation_results: + for trait_name, values in trait_dict.items(): + + if corr_dict.get(trait_name): + + trait_dict[trait_name].update(corr_dict.get(trait_name)) + + return correlation_results + + def sample_for_trait_lists(corr_results, target_dataset, this_trait, this_dataset, start_vars): """interface function for correlation on top results""" - sample_data = process_samples( - start_vars, this_dataset.group.samplelist) - target_dataset.get_trait_data(list(sample_data.keys())) - # should filter target traits from here - _corr_results = corr_results - - this_trait = retrieve_sample_data(this_trait, this_dataset) - - this_trait_data = { - "trait_sample_data": sample_data, - "trait_id": start_vars["trait_id"] - } - results = map_shared_keys_to_values( - target_dataset.samplelist, target_dataset.trait_data) + (this_trait_data, target_dataset) = fetch_sample_data( + start_vars, this_trait, this_dataset, target_dataset) correlation_results = compute_all_sample_correlation(corr_method="pearson", this_trait=this_trait_data, - target_dataset=results) + target_dataset=target_dataset) return correlation_results @@ -105,6 +111,23 @@ def lit_for_trait_list(corr_results, this_dataset, this_trait): return correlation_results +def fetch_sample_data(start_vars, this_trait, this_dataset, target_dataset): + + sample_data = process_samples( + start_vars, this_dataset.group.samplelist) + target_dataset.get_trait_data(list(sample_data.keys())) + this_trait = retrieve_sample_data(this_trait, this_dataset) + this_trait_data = { + "trait_sample_data": sample_data, + "trait_id": start_vars["trait_id"] + } + + results = map_shared_keys_to_values( + target_dataset.samplelist, target_dataset.trait_data) + + return (this_trait_data, results) + + def compute_correlation(start_vars, method="pearson"): """compute correlation for to call gn3 api""" # pylint: disable-msg=too-many-locals @@ -114,36 +137,19 @@ def compute_correlation(start_vars, method="pearson"): (this_dataset, this_trait, target_dataset, sample_data) = create_target_this_trait(start_vars) + target_dataset_type = target_dataset.type + this_dataset_type = this_dataset.type + method = start_vars['corr_sample_method'] corr_return_results = int(start_vars.get("corr_return_results", 100)) corr_input_data = {} if corr_type == "sample": - - sample_data = process_samples( - start_vars, this_dataset.group.samplelist) - target_dataset.get_trait_data(list(sample_data.keys())) - this_trait = retrieve_sample_data(this_trait, this_dataset) - this_trait_data = { - "trait_sample_data": sample_data, - "trait_id": start_vars["trait_id"] - } - results = map_shared_keys_to_values( - target_dataset.samplelist, target_dataset.trait_data) + (this_trait_data, target_dataset_data) = fetch_sample_data( + start_vars, this_trait, this_dataset, target_dataset) correlation_results = compute_all_sample_correlation(corr_method=method, this_trait=this_trait_data, - target_dataset=results) - - # do tissue correaltion - - # code to be use later - - # tissue_result = tissue_for_trait_lists( - # correlation_results, this_dataset, this_trait) - # # lit spoils the party so slow - # lit_result = lit_for_trait_list( - # correlation_results, this_dataset, this_trait) - + target_dataset=target_dataset_data) elif corr_type == "tissue": trait_symbol_dict = this_dataset.retrieve_genes("Symbol") @@ -172,7 +178,50 @@ def compute_correlation(start_vars, method="pearson"): conn=conn, trait_lists=list(geneid_dict.items()), species=species, gene_id=this_trait_geneid) - return correlation_results[0:corr_return_results] + correlation_results = correlation_results[0:corr_return_results] + + compute_all = True # later to be passed as argument + + if (compute_all): + + correlation_results = compute_corr_for_top_results(correlation_results, + this_trait, + this_dataset, + target_dataset, + corr_type) + + correlation_data = {"correlation_results": correlation_results, + "this_trait": this_trait.name, + "target_dataset": start_vars['corr_dataset'], + "return_results": corr_return_results} + + return correlation_data + + +def compute_corr_for_top_results(correlation_results, + this_trait, + this_dataset, + target_dataset, + corr_type): + if corr_type != "tissue" and this_dataset.type == "ProbeSet" and target_dataset.type == "ProbeSet": + + tissue_result = tissue_for_trait_lists( + correlation_results, this_dataset, this_trait) + + correlation_results = merge_correlation_results( + correlation_results, tissue_result) + + if corr_type != "lit" and this_dataset.type == "ProbeSet" and target_dataset.type == "ProbeSet": + lit_result = lit_for_trait_list( + correlation_results, this_dataset, this_trait) + + correlation_results = merge_correlation_results( + correlation_results, lit_result) + + if corr_type != "sample": + pass + + return correlation_results def do_lit_correlation(this_trait, this_dataset): diff --git a/wqflask/wqflask/templates/test_correlation_page.html b/wqflask/wqflask/templates/test_correlation_page.html index 037e9735..0809b65e 100644 --- a/wqflask/wqflask/templates/test_correlation_page.html +++ b/wqflask/wqflask/templates/test_correlation_page.html @@ -42,7 +42,7 @@ {% block content %} <div class="correlation-title"> - <h3>Correlation Results for <span>Dataset_name</span> against <span><a href="">trait_name</a></span> for the top <span>all</span> Results</h3> + <h3>Correlation Results for <span>{{target_dataset}}</span> against <span><a href="">{{this_trait}}</a></span> for the top <span>{{return_results}}</span> Results</h3> </div> <div class="header-toggle-vis"> <h4 style="font-weight: bolder;padding: 5px 3px;">Toggle Columns</h4> @@ -84,7 +84,6 @@ <script language="javascript" type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.1/js/all.min.js"></script> <script language="javascript" type="text/javascript" src="{{ url_for('js', filename='DataTablesExtensions/scroller/js/dataTables.scroller.min.js') }}"></script> <script type="text/javascript"> - console.log("running this script") let correlationResults = {{correlation_results|safe}} // document.querySelector(".content").innerHTML =correlationResults // parse the data @@ -102,7 +101,7 @@ return new_dict; }) -console.log(correlationResults) +console.log(correlationResults) </script> @@ -138,7 +137,8 @@ console.log(correlationResults) { "targets":2, "render":(data,type,row)=>{ - let urlLink = "/show_trait?trait_id=1453207_at&dataset=HC_M2_0606_P" + // should use a dynamic dataset name + let urlLink = `/show_trait?trait_id=${data}&dataset=HC_M2_0606_P` let traitLink = `<a href=${urlLink}>${data}</a>` return traitLink }, diff --git a/wqflask/wqflask/templates/tutorials.html b/wqflask/wqflask/templates/tutorials.html index ce5d0e3d..ed1b6f3a 100644 --- a/wqflask/wqflask/templates/tutorials.html +++ b/wqflask/wqflask/templates/tutorials.html @@ -10,6 +10,7 @@ <LI><A HREF="http://www.nervenet.org/tutorials/HS_Rat_Using_GeneNetwork_21Apr2020v7.pptx">Statistical and genetic functions, and initial mapping results for Rat GWAS P50 as implemented in GeneNetwork.org</A></LI> <LI><A HREF="https://opar.io/training/osga-webinar-series-2020.html">Webinar Series - Quantitative Genetics Tools for Mapping Trait Variation to Mechanisms, Therapeutics, and Interventions</A></LI> + <LI><A HREF="https://opar.io/training/osga-webinar-series-2020.html">TESTING</A></LI> </UL> <P></P> </TD> diff --git a/wqflask/wqflask/views.py b/wqflask/wqflask/views.py index 3c875163..4834ee63 100644 --- a/wqflask/wqflask/views.py +++ b/wqflask/wqflask/views.py @@ -972,8 +972,8 @@ def corr_compute_page(): @app.route("/test_corr_compute", methods=["POST"]) def test_corr_compute_page(): - correlation_results = compute_correlation(request.form) - return render_template("test_correlation_page.html", correlation_results=correlation_results) + correlation_data = compute_correlation(request.form) + return render_template("test_correlation_page.html", **correlation_data) @app.route("/corr_matrix", methods=('POST',)) def corr_matrix_page(): |