diff options
author | Arun Isaac | 2023-12-29 18:55:37 +0000 |
---|---|---|
committer | Arun Isaac | 2023-12-29 19:01:46 +0000 |
commit | 204a308be0f741726b9a620d88fbc22b22124c81 (patch) | |
tree | b3cf66906674020b530c844c2bb4982c8a0e2d39 /gn2/wqflask/api | |
parent | 83062c75442160427b50420161bfcae2c5c34c84 (diff) | |
download | genenetwork2-204a308be0f741726b9a620d88fbc22b22124c81.tar.gz |
Namespace all modules under gn2.
We move all modules under a gn2 directory. This is important for
"correct" packaging and deployment as a Guix service.
Diffstat (limited to 'gn2/wqflask/api')
-rw-r--r-- | gn2/wqflask/api/__init__.py | 0 | ||||
-rw-r--r-- | gn2/wqflask/api/correlation.py | 244 | ||||
-rw-r--r-- | gn2/wqflask/api/gen_menu.py | 217 | ||||
-rw-r--r-- | gn2/wqflask/api/jobs.py | 54 | ||||
-rw-r--r-- | gn2/wqflask/api/mapping.py | 186 | ||||
-rw-r--r-- | gn2/wqflask/api/markdown.py | 186 | ||||
-rw-r--r-- | gn2/wqflask/api/router.py | 1037 |
7 files changed, 1924 insertions, 0 deletions
diff --git a/gn2/wqflask/api/__init__.py b/gn2/wqflask/api/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/gn2/wqflask/api/__init__.py diff --git a/gn2/wqflask/api/correlation.py b/gn2/wqflask/api/correlation.py new file mode 100644 index 00000000..090d13ac --- /dev/null +++ b/gn2/wqflask/api/correlation.py @@ -0,0 +1,244 @@ +import collections +import scipy +import numpy + +from gn2.base import data_set +from gn2.base.trait import create_trait, retrieve_sample_data +from gn2.utility import corr_result_helpers +from gn2.utility.tools import get_setting +from gn2.wqflask.correlation import correlation_functions +from gn2.wqflask.database import database_connection + +def do_correlation(start_vars): + if 'db' not in start_vars: + raise ValueError("'db' not found!") + if 'target_db' not in start_vars: + raise ValueError("'target_db' not found!") + if 'trait_id' not in start_vars: + raise ValueError("'trait_id' not found!") + + this_dataset = data_set.create_dataset(dataset_name=start_vars['db']) + target_dataset = data_set.create_dataset( + dataset_name=start_vars['target_db']) + this_trait = create_trait(dataset=this_dataset, + name=start_vars['trait_id']) + this_trait = retrieve_sample_data(this_trait, this_dataset) + + corr_params = init_corr_params(start_vars) + + corr_results = calculate_results( + this_trait, this_dataset, target_dataset, corr_params) + + final_results = [] + for _trait_counter, trait in enumerate(list(corr_results.keys())[:corr_params['return_count']]): + if corr_params['type'] == "tissue": + [sample_r, num_overlap, sample_p, symbol] = corr_results[trait] + result_dict = { + "trait": trait, + "sample_r": sample_r, + "#_strains": num_overlap, + "p_value": sample_p, + "symbol": symbol + } + elif corr_params['type'] == "literature" or corr_params['type'] == "lit": + [gene_id, sample_r] = corr_results[trait] + result_dict = { + "trait": trait, + "sample_r": sample_r, + "gene_id": gene_id + } + else: + [sample_r, sample_p, num_overlap] = corr_results[trait] + result_dict = { + "trait": trait, + "sample_r": sample_r, + "#_strains": num_overlap, + "p_value": sample_p + } + final_results.append(result_dict) + return final_results + + +def calculate_results(this_trait, this_dataset, target_dataset, corr_params): + corr_results = {} + + target_dataset.get_trait_data() + + if corr_params['type'] == "tissue": + trait_symbol_dict = this_dataset.retrieve_genes("Symbol") + corr_results = do_tissue_correlation_for_all_traits( + this_trait, trait_symbol_dict, corr_params) + sorted_results = collections.OrderedDict(sorted(list(corr_results.items()), + key=lambda t: -abs(t[1][1]))) + # ZS: Just so a user can use either "lit" or "literature" + elif corr_params['type'] == "literature" or corr_params['type'] == "lit": + trait_geneid_dict = this_dataset.retrieve_genes("GeneId") + corr_results = do_literature_correlation_for_all_traits( + this_trait, this_dataset, trait_geneid_dict, corr_params) + sorted_results = collections.OrderedDict(sorted(list(corr_results.items()), + key=lambda t: -abs(t[1][1]))) + else: + for target_trait, target_vals in list(target_dataset.trait_data.items()): + result = get_sample_r_and_p_values( + this_trait, this_dataset, target_vals, target_dataset, corr_params['type']) + if result is not None: + corr_results[target_trait] = result + + sorted_results = collections.OrderedDict( + sorted(list(corr_results.items()), key=lambda t: -abs(t[1][0]))) + + return sorted_results + + +def do_tissue_correlation_for_all_traits(this_trait, 
trait_symbol_dict, corr_params, tissue_dataset_id=1): + # Gets tissue expression values for the primary trait + primary_trait_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values( + symbol_list=[this_trait.symbol]) + + if this_trait.symbol.lower() in primary_trait_tissue_vals_dict: + primary_trait_tissue_values = primary_trait_tissue_vals_dict[this_trait.symbol.lower( + )] + + corr_result_tissue_vals_dict = correlation_functions.get_trait_symbol_and_tissue_values( + symbol_list=list(trait_symbol_dict.values())) + + tissue_corr_data = {} + for trait, symbol in list(trait_symbol_dict.items()): + if symbol and symbol.lower() in corr_result_tissue_vals_dict: + this_trait_tissue_values = corr_result_tissue_vals_dict[symbol.lower( + )] + + result = correlation_functions.cal_zero_order_corr_for_tiss(primary_trait_tissue_values, + this_trait_tissue_values, + corr_params['method']) + + tissue_corr_data[trait] = [ + result[0], result[1], result[2], symbol] + + return tissue_corr_data + + +def do_literature_correlation_for_all_traits(this_trait, target_dataset, trait_geneid_dict, corr_params): + input_trait_mouse_gene_id = convert_to_mouse_gene_id( + target_dataset.group.species.lower(), this_trait.geneid) + + lit_corr_data = {} + for trait, gene_id in list(trait_geneid_dict.items()): + mouse_gene_id = convert_to_mouse_gene_id( + target_dataset.group.species.lower(), gene_id) + + if mouse_gene_id and str(mouse_gene_id).find(";") == -1: + result = "" + with database_connection(get_setting("SQL_URI")) as conn: + with conn.cursor() as cursor: + cursor.execute( + ("SELECT value FROM LCorrRamin3 " + "WHERE GeneId1=%s AND GeneId2=%s"), + (mouse_gene_id, + input_trait_mouse_gene_id)) + result = cursor.fetchone() + if not result: + cursor.execute( + ("SELECT value FROM LCorrRamin3 " + "WHERE GeneId2=%s AND GeneId1=%s"), + (mouse_gene_id, + input_trait_mouse_gene_id)) + result = cursor.fetchone() + if result: + lit_corr = result[0] + lit_corr_data[trait] = [gene_id, lit_corr] + else: + lit_corr_data[trait] = [gene_id, 0] + else: + lit_corr_data[trait] = [gene_id, 0] + + return lit_corr_data + + +def get_sample_r_and_p_values(this_trait, this_dataset, target_vals, target_dataset, type): + """ + Calculates the sample r (or rho) and p-value + + Given a primary trait and a target trait's sample values, + calculates either the pearson r or spearman rho and the p-value + using the corresponding scipy functions. 
+ """ + + this_trait_vals = [] + shared_target_vals = [] + for i, sample in enumerate(target_dataset.group.samplelist): + if sample in this_trait.data: + this_sample_value = this_trait.data[sample].value + target_sample_value = target_vals[i] + this_trait_vals.append(this_sample_value) + shared_target_vals.append(target_sample_value) + + this_trait_vals, shared_target_vals, num_overlap = corr_result_helpers.normalize_values( + this_trait_vals, shared_target_vals) + + if type == 'pearson': + sample_r, sample_p = scipy.stats.pearsonr( + this_trait_vals, shared_target_vals) + else: + sample_r, sample_p = scipy.stats.spearmanr( + this_trait_vals, shared_target_vals) + + if num_overlap > 5: + if numpy.isnan(sample_r): + return None + else: + return [sample_r, sample_p, num_overlap] + + +def convert_to_mouse_gene_id(species=None, gene_id=None): + """If the species is rat or human, translate the gene_id to the mouse geneid + + If there is no input gene_id or there's no corresponding mouse gene_id, return None + + """ + if not gene_id: + return None + + mouse_gene_id = None + with database_connection(get_setting("SQL_URI")) as conn: + with conn.cursor() as cursor: + if species == 'mouse': + mouse_gene_id = gene_id + elif species == 'rat': + cursor.execute( + ("SELECT mouse FROM GeneIDXRef " + "WHERE rat=%s"), gene_id) + result = cursor.fetchone() + if result: + mouse_gene_id = result[0] + elif species == 'human': + cursor.execute( + "SELECT mouse FROM GeneIDXRef " + "WHERE human=%s", gene_id) + result = cursor.fetchone() + if result: + mouse_gene_id = result[0] + return mouse_gene_id + + +def init_corr_params(start_vars): + method = "pearson" + if 'method' in start_vars: + method = start_vars['method'] + + type = "sample" + if 'type' in start_vars: + type = start_vars['type'] + + return_count = 500 + if 'return_count' in start_vars: + assert(start_vars['return_count'].isdigit()) + return_count = int(start_vars['return_count']) + + corr_params = { + 'method': method, + 'type': type, + 'return_count': return_count + } + + return corr_params diff --git a/gn2/wqflask/api/gen_menu.py b/gn2/wqflask/api/gen_menu.py new file mode 100644 index 00000000..45d5739e --- /dev/null +++ b/gn2/wqflask/api/gen_menu.py @@ -0,0 +1,217 @@ +from gn3.db.species import get_all_species + +def gen_dropdown_json(conn): + """Generates and outputs (as json file) the data for the main dropdown menus on + the home page + """ + species = get_all_species(conn) + groups = get_groups(species, conn) + types = get_types(groups, conn) + datasets = get_datasets(types, conn) + return dict(species=species, + groups=groups, + types=types, + datasets=datasets) + + +def get_groups(species, conn): + """Build groups list""" + groups = {} + with conn.cursor() as cursor: + for species_name, _species_full_name in species: + groups[species_name] = [] + query = ("SELECT InbredSet.Name, InbredSet.FullName, " + "IFNULL(InbredSet.Family, 'None') " + "FROM InbredSet, Species WHERE Species.Name = '{}' " + "AND InbredSet.SpeciesId = Species.Id GROUP by " + "InbredSet.Name ORDER BY IFNULL(InbredSet.FamilyOrder, " + "InbredSet.FullName) ASC, IFNULL(InbredSet.Family, " + "InbredSet.FullName) ASC, InbredSet.FullName ASC, " + "InbredSet.MenuOrderId ASC").format(species_name) + cursor.execute(query) + results = cursor.fetchall() + for result in results: + family_name = "Family:" + str(result[2]) + groups[species_name].append( + [str(result[0]), str(result[1]), family_name]) + return groups + + +def get_types(groups, conn): + """Build types list""" + types = 
{} + + for species, group_dict in list(groups.items()): + types[species] = {} + for group_name, _group_full_name, _family_name in group_dict: + if phenotypes_exist(group_name, conn): + types[species][group_name] = [ + ("Phenotypes", "Traits and Cofactors", "Phenotypes")] + if genotypes_exist(group_name, conn): + if group_name in types[species]: + types[species][group_name] += [ + ("Genotypes", "DNA Markers and SNPs", "Genotypes")] + else: + types[species][group_name] = [ + ("Genotypes", "DNA Markers and SNPs", "Genotypes")] + if group_name in types[species]: + types_list = build_types(species, group_name, conn) + if len(types_list) > 0: + types[species][group_name] += types_list + else: + types_list = build_types(species, group_name, conn) + if len(types_list) > 0: + types[species][group_name] = types_list + else: + types[species].pop(group_name, None) + groups[species] = list( + group for group in groups[species] + if group[0] != group_name) + return types + + +def phenotypes_exist(group_name, conn): + results = [] + with conn.cursor() as cursor: + cursor.execute( + ("SELECT Name FROM PublishFreeze " + "WHERE PublishFreeze.Name = " + "'{}'").format(group_name + "Publish")) + results = cursor.fetchone() + return bool(results) + + +def genotypes_exist(group_name, conn): + with conn.cursor() as cursor: + cursor.execute( + ("SELECT Name FROM GenoFreeze " + + "WHERE GenoFreeze.Name = '{}'").format( + group_name + "Geno")) + results = cursor.fetchone() + return bool(results) + + +def build_types(species, group, conn): + """Fetches tissues + + Gets the tissues with data for this species/group + (all types except phenotype/genotype are tissues) + + """ + + query = ("SELECT DISTINCT Tissue.Name " + "FROM ProbeFreeze, ProbeSetFreeze, InbredSet, " + "Tissue, Species WHERE Species.Name = '{0}' " + "AND Species.Id = InbredSet.SpeciesId AND " + "InbredSet.Name = '{1}' AND ProbeFreeze.TissueId = " + "Tissue.Id AND ProbeFreeze.InbredSetId = InbredSet.Id " + "AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id " + "ORDER BY Tissue.Name").format(species, group) + + results = [] + with conn.cursor() as cursor: + cursor.execute(query) + for result in cursor.fetchall(): + if bool(result): + these_datasets = build_datasets(species, + group, result[0], conn) + if len(these_datasets) > 0: + results.append([str(result[0]), str(result[0]), + "Molecular Traits"]) + return results + + +def get_datasets(types, conn): + """Build datasets list""" + datasets = {} + for species, group_dict in list(types.items()): + datasets[species] = {} + for group, type_list in list(group_dict.items()): + datasets[species][group] = {} + for type_name in type_list: + these_datasets = build_datasets(species, group, + type_name[0], conn) + if bool(these_datasets): + datasets[species][group][type_name[0]] = these_datasets + + return datasets + + +def build_datasets(species, group, type_name, conn): + """Gets dataset names from database""" + dataset_text = dataset_value = None + datasets = [] + with conn.cursor() as cursor: + if type_name == "Phenotypes": + cursor.execute( + ("SELECT InfoFiles.GN_AccesionId, PublishFreeze.Name, " + "PublishFreeze.FullName FROM InfoFiles, PublishFreeze, " + "InbredSet WHERE InbredSet.Name = '{}' AND " + "PublishFreeze.InbredSetId = InbredSet.Id AND " + "InfoFiles.InfoPageName = PublishFreeze.Name " + "ORDER BY PublishFreeze.CreateTime ASC").format(group)) + results = cursor.fetchall() + if bool(results): + for result in results: + dataset_id = str(result[0]) + dataset_value = str(result[1]) + dataset_text = 
str(result[2]) + if group == 'MDP': + dataset_text = "Mouse Phenome Database" + + datasets.append([dataset_id, dataset_value, dataset_text]) + else: + cursor.execute( + ("SELECT PublishFreeze.Name, PublishFreeze.FullName " + "FROM PublishFreeze, InbredSet " + "WHERE InbredSet.Name = '{}' AND " + "PublishFreeze.InbredSetId = InbredSet.Id " + "ORDER BY PublishFreeze.CreateTime ASC") + .format(group)) + result = cursor.fetchone() + dataset_id = "None" + dataset_value = str(result[0]) + dataset_text = str(result[1]) + datasets.append([dataset_id, dataset_value, dataset_text]) + + elif type_name == "Genotypes": + cursor.execute( + ("SELECT InfoFiles.GN_AccesionId " + "FROM InfoFiles, GenoFreeze, InbredSet " + "WHERE InbredSet.Name = '{}' AND " + "GenoFreeze.InbredSetId = InbredSet.Id AND " + "InfoFiles.InfoPageName = GenoFreeze.ShortName " + "ORDER BY GenoFreeze.CreateTime " + "DESC").format(group)) + results = cursor.fetchone() + dataset_id = "None" + if bool(results): + dataset_id = str(results[0]) + + dataset_value = "%sGeno" % group + dataset_text = "%s Genotypes" % group + datasets.append([dataset_id, dataset_value, dataset_text]) + + else: # for mRNA expression/ProbeSet + cursor.execute( + ("SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, " + "ProbeSetFreeze.FullName FROM ProbeSetFreeze, " + "ProbeFreeze, InbredSet, Tissue, Species WHERE " + "Species.Name = '{0}' AND Species.Id = " + "InbredSet.SpeciesId AND InbredSet.Name = '{1}' " + "AND ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id " + "AND Tissue.Name = '{2}' AND ProbeFreeze.TissueId = " + "Tissue.Id AND ProbeFreeze.InbredSetId = InbredSet.Id " + "AND ProbeSetFreeze.public > 0 " + "ORDER BY -ProbeSetFreeze.OrderList DESC, " + "ProbeSetFreeze.CreateTime " + "DESC").format(species, group, type_name)) + results = cursor.fetchall() + datasets = [] + for dataset_info in results: + this_dataset_info = [] + for info in dataset_info: + this_dataset_info.append(str(info)) + datasets.append(this_dataset_info) + + return datasets diff --git a/gn2/wqflask/api/jobs.py b/gn2/wqflask/api/jobs.py new file mode 100644 index 00000000..7a948e1a --- /dev/null +++ b/gn2/wqflask/api/jobs.py @@ -0,0 +1,54 @@ +import uuid +from datetime import datetime + +from redis import Redis +from pymonad.io import IO +from flask import Blueprint, render_template + +from gn2.jobs.jobs import job + +jobs = Blueprint("jobs", __name__) + +@jobs.route("/debug/<uuid:job_id>") +def debug_job(job_id: uuid.UUID): + """Display job data to assist in debugging.""" + from gn2.utility.tools import REDIS_URL # Avoids circular import error + + def __stream_to_lines__(stream): + removables = ( + "Set global log level to", "runserver.py: ******", + "APPLICATION_ROOT:", "DB_", "DEBUG:", "ELASTICSEARCH_", "ENV:", + "EXPLAIN_TEMPLATE_LOADING:", "GEMMA_", "GENENETWORK_FILES", + "GITHUB_", "GN2_", "GN3_", "GN_", "HOME:", "JSONIFY_", "JS_", + "JSON_", "LOG_", "MAX_", "ORCID_", "PERMANENT_", "PLINK_", + "PREFERRED_URL_SCHEME", "PRESERVE_CONTEXT_ON_EXCEPTION", + "PROPAGATE_EXCEPTIONS", "REAPER_COMMAND", "REDIS_URL", "SECRET_", + "SECURITY_", "SEND_FILE_MAX_AGE_DEFAULT", "SERVER_", "SESSION_", + "SMTP_", "SQL_", "TEMPLATES_", "TESTING:", "TMPDIR", "TRAP_", + "USE_", "WEBSERVER_") + return tuple(filter( + lambda line: not any(line.startswith(item) for item in removables), + stream.split("\n"))) + + def __fmt_datetime(val): + return datetime.strptime(val, "%Y-%m-%dT%H:%M:%S.%f").strftime( + "%A, %d %B %Y at %H:%M:%S.%f") + + def __render_debug_page__(job): + job_details = {key.replace("-", "_"): val 
for key,val in job.items()} + return render_template( + "jobs/debug.html", + **{ + **job_details, + "request_received_time": __fmt_datetime( + job_details["request_received_time"]), + "stderr": __stream_to_lines__(job_details["stderr"]), + "stdout": __stream_to_lines__(job_details["stdout"]) + }) + + with Redis.from_url(REDIS_URL, decode_responses=True) as rconn: + the_job = job(rconn, job_id) + + return the_job.maybe( + render_template("jobs/no-such-job.html", job_id=job_id), + lambda job: __render_debug_page__(job)) diff --git a/gn2/wqflask/api/mapping.py b/gn2/wqflask/api/mapping.py new file mode 100644 index 00000000..1e330963 --- /dev/null +++ b/gn2/wqflask/api/mapping.py @@ -0,0 +1,186 @@ +from gn2.base import data_set +from gn2.base.trait import create_trait, retrieve_sample_data + +from gn2.wqflask.marker_regression import gemma_mapping, rqtl_mapping +from gn2.wqflask.show_trait.show_trait import normf + +def do_mapping_for_api(start_vars): + if ('db' not in start_vars) or ("trait_id" not in start_vars): + raise ValueError("Mapping: db and trait_id are not in start_vars") + + dataset = data_set.create_dataset(dataset_name=start_vars['db']) + dataset.group.get_markers() + this_trait = create_trait(dataset=dataset, name=start_vars['trait_id']) + this_trait = retrieve_sample_data(this_trait, dataset) + + samples = [] + vals = [] + + mapping_params = initialize_parameters(start_vars, dataset, this_trait) + + genofile_samplelist = [] + if mapping_params.get('genofile'): + dataset.group.genofile = mapping_params['genofile'] + genofile_samplelist = get_genofile_samplelist(dataset) + + if (len(genofile_samplelist) > 0): + samplelist = genofile_samplelist + for sample in samplelist: + in_trait_data = False + for item in this_trait.data: + if this_trait.data[item].name == sample: + value = str(this_trait.data[item].value) + samples.append(item) + vals.append(value) + in_trait_data = True + break + if not in_trait_data: + vals.append("x") + else: + samplelist = dataset.group.samplelist + for sample in samplelist: + in_trait_data = False + for item in this_trait.data: + if this_trait.data[item].name == sample: + value = str(this_trait.data[item].value) + samples.append(item) + vals.append(value) + in_trait_data = True + break + if not in_trait_data: + vals.append("x") + + if mapping_params.get('transform') == "qnorm": + vals_minus_x = [float(val) for val in vals if val != "x"] + qnorm_vals = normf(vals_minus_x) + qnorm_vals_with_x = [] + counter = 0 + for val in vals: + if val == "x": + qnorm_vals_with_x.append("x") + else: + qnorm_vals_with_x.append(qnorm_vals[counter]) + counter += 1 + + vals = qnorm_vals_with_x + + # It seems to take an empty string as default. This should probably be changed. 
+ covariates = "" + + if mapping_params.get('mapping_method') == "gemma": + header_row = ["name", "chr", "Mb", "lod_score", "p_value"] + # gemma_mapping returns both results and the filename for LOCO, so need to only grab the former for api + if mapping_params.get('use_loco') == "True": + result_markers = gemma_mapping.run_gemma( + this_trait, dataset, samples, vals, covariates, mapping_params['use_loco'], mapping_params['maf'])[0] + else: + result_markers = gemma_mapping.run_gemma( + this_trait, dataset, samples, vals, covariates, mapping_params['use_loco'], mapping_params['maf']) + elif mapping_params.get('mapping_method') == "rqtl": + header_row = ["name", "chr", "cM", "lod_score"] + if mapping_params['num_perm'] > 0: + _sperm_output, _suggestive, _significant, result_markers = rqtl_mapping.run_rqtl(this_trait.name, vals, samples, dataset, None, "Mb", mapping_params['rqtl_model'], + mapping_params['rqtl_method'], mapping_params['num_perm'], None, + mapping_params['do_control'], mapping_params['control_marker'], + mapping_params['manhattan_plot'], None) + else: + result_markers = rqtl_mapping.run_rqtl(this_trait.name, vals, samples, dataset, None, "Mb", mapping_params['rqtl_model'], + mapping_params['rqtl_method'], mapping_params['num_perm'], None, + mapping_params['do_control'], mapping_params['control_marker'], + mapping_params['manhattan_plot'], None) + + if mapping_params.get('limit_to'): + result_markers = result_markers[:mapping_params['limit_to']] + + if mapping_params.get('format') == "csv": + output_rows = [] + output_rows.append(header_row) + for marker in result_markers: + this_row = [marker[header] for header in header_row] + output_rows.append(this_row) + + return output_rows, mapping_params['format'] + elif mapping_params['format'] == "json": + return result_markers, mapping_params['format'] + else: + return result_markers, None + + +def initialize_parameters(start_vars, dataset, this_trait): + mapping_params = {} + + mapping_params['format'] = "json" + if 'format' in start_vars: + mapping_params['format'] = start_vars['format'] + + mapping_params['limit_to'] = False + if 'limit_to' in start_vars: + if start_vars['limit_to'].isdigit(): + mapping_params['limit_to'] = int(start_vars['limit_to']) + + mapping_params['mapping_method'] = "gemma" + if 'method' in start_vars: + mapping_params['mapping_method'] = start_vars['method'] + + if mapping_params['mapping_method'] == "rqtl": + mapping_params['rqtl_method'] = "hk" + mapping_params['rqtl_model'] = "normal" + mapping_params['do_control'] = False + mapping_params['control_marker'] = "" + mapping_params['manhattan_plot'] = True + mapping_params['pair_scan'] = False + if 'rqtl_method' in start_vars: + mapping_params['rqtl_method'] = start_vars['rqtl_method'] + if 'rqtl_model' in start_vars: + mapping_params['rqtl_model'] = start_vars['rqtl_model'] + if 'control_marker' in start_vars: + mapping_params['control_marker'] = start_vars['control_marker'] + mapping_params['do_control'] = True + if 'pair_scan' in start_vars: + if start_vars['pair_scan'].lower() == "true": + mapping_params['pair_scan'] = True + + if 'interval_mapping' in start_vars: + if start_vars['interval_mapping'].lower() == "true": + mapping_params['manhattan_plot'] = False + elif 'manhattan_plot' in start_vars: + if start_vars['manhattan_plot'].lower() != "true": + mapping_params['manhattan_plot'] = False + + mapping_params['maf'] = 0.01 + if 'maf' in start_vars: + mapping_params['maf'] = start_vars['maf'] # Minor allele frequency + + mapping_params['use_loco'] = 
True + if 'use_loco' in start_vars: + if (start_vars['use_loco'].lower() == "false") or (start_vars['use_loco'].lower() == "no"): + mapping_params['use_loco'] = False + + mapping_params['num_perm'] = 0 + mapping_params['perm_check'] = False + if 'num_perm' in start_vars: + try: + mapping_params['num_perm'] = int(start_vars['num_perm']) + mapping_params['perm_check'] = "ON" + except: + mapping_params['perm_check'] = False + + mapping_params['transform'] = False + if 'transform' in start_vars: + mapping_params['transform'] = start_vars['transform'] + + mapping_params['genofile'] = False + if 'genofile' in start_vars: + mapping_params['genofile'] = start_vars['genofile'] + + return mapping_params + +def get_genofile_samplelist(dataset): + genofile_samplelist = [] + + genofile_json = dataset.group.get_genofiles() + for genofile in genofile_json: + if genofile['location'] == dataset.group.genofile and 'sample_list' in genofile: + genofile_samplelist = genofile['sample_list'] + + return genofile_samplelist diff --git a/gn2/wqflask/api/markdown.py b/gn2/wqflask/api/markdown.py new file mode 100644 index 00000000..580b9ac0 --- /dev/null +++ b/gn2/wqflask/api/markdown.py @@ -0,0 +1,186 @@ +"""Markdown routes + +Render pages from github, or if they are unavailable, look for it else where +""" + +import requests +import markdown +import os +import sys + +from bs4 import BeautifulSoup # type: ignore + +from flask import send_from_directory +from flask import Blueprint +from flask import render_template + +from typing import Dict +from typing import List + +glossary_blueprint = Blueprint('glossary_blueprint', __name__) +references_blueprint = Blueprint("references_blueprint", __name__) +environments_blueprint = Blueprint("environments_blueprint", __name__) +links_blueprint = Blueprint("links_blueprint", __name__) +policies_blueprint = Blueprint("policies_blueprint", __name__) +facilities_blueprint = Blueprint("facilities_blueprint", __name__) +news_blueprint = Blueprint("news_blueprint", __name__) + +blogs_blueprint = Blueprint("blogs_blueprint", __name__) + + +def render_markdown(file_name, is_remote_file=True): + """Try to fetch the file name from Github and if that fails, try to +look for it inside the file system """ + github_url = ("https://raw.githubusercontent.com/" + "genenetwork/gn-docs/master/") + + if not is_remote_file: + text = "" + with open(file_name, "r", encoding="utf-8") as input_file: + text = input_file.read() + return markdown.markdown(text, + extensions=['tables']) + + md_content = requests.get(f"{github_url}{file_name}") + + if md_content.status_code == 200: + return markdown.markdown(md_content.content.decode("utf-8"), + extensions=['tables']) + + return (f"\nContent for {file_name} not available. " + "Please check " + "(here to see where content exists)" + "[https://github.com/genenetwork/gn-docs]. 
" + "Please reach out to the gn2 team to have a look at this") + + +def get_file_from_python_search_path(pathname_suffix): + cands = [os.path.join(d, pathname_suffix) for d in sys.path] + try: + return list(filter(os.path.exists, cands))[0] + except IndexError: + return None + + +def get_blogs(user: str = "genenetwork", + repo_name: str = "gn-docs") -> dict: + + blogs: Dict[int, List] = {} + github_url = f"https://api.github.com/repos/{user}/{repo_name}/git/trees/master?recursive=1" + + repo_tree = requests.get(github_url).json()["tree"] + + for data in repo_tree: + path_name = data["path"] + if path_name.startswith("blog") and path_name.endswith(".md"): + split_path = path_name.split("/")[1:] + try: + year, title, file_name = split_path + except Exception as e: + year, file_name = split_path + title = "" + + subtitle = os.path.splitext(file_name)[0] + + blog = { + "title": title, + "subtitle": subtitle, + "full_path": path_name + } + + if year in blogs: + blogs[int(year)].append(blog) + else: + blogs[int(year)] = [blog] + + return dict(sorted(blogs.items(), key=lambda x: x[0], reverse=True)) + + +@glossary_blueprint.route('/') +def glossary(): + return render_template( + "glossary.html", + rendered_markdown=render_markdown("general/glossary/glossary.md")), 200 + + +@references_blueprint.route('/') +def references(): + return render_template( + "references.html", + rendered_markdown=render_markdown("general/references/references.md")), 200 + + +@news_blueprint.route('/') +def news(): + return render_template( + "news.html", + rendered_markdown=render_markdown("general/news/news.md")), 200 + + +@environments_blueprint.route("/") +def environments(): + + md_file = get_file_from_python_search_path("wqflask/DEPENDENCIES.md") + svg_file = get_file_from_python_search_path( + "wqflask/dependency-graph.html") + svg_data = None + if svg_file: + with open(svg_file, 'r') as f: + svg_data = "".join( + BeautifulSoup(f.read(), + 'lxml').body.script.contents) + + if md_file is not None: + return ( + render_template("environment.html", + svg_data=svg_data, + rendered_markdown=render_markdown( + md_file, + is_remote_file=False)), + 200 + ) + # Fallback: Fetch file from server + return (render_template( + "environment.html", + svg_data=None, + rendered_markdown=render_markdown( + "general/environments/environments.md")), + 200) + + +@environments_blueprint.route('/svg-dependency-graph') +def svg_graph(): + directory, file_name, _ = get_file_from_python_search_path( + "wqflask/dependency-graph.svg").partition("dependency-graph.svg") + return send_from_directory(directory, file_name) + + +@links_blueprint.route("/") +def links(): + return render_template( + "links.html", + rendered_markdown=render_markdown("general/links/links.md")), 200 + + +@policies_blueprint.route("/") +def policies(): + return render_template( + "policies.html", + rendered_markdown=render_markdown("general/policies/policies.md")), 200 + + +@facilities_blueprint.route("/") +def facilities(): + return render_template("facilities.html", rendered_markdown=render_markdown("general/help/facilities.md")), 200 + + +@blogs_blueprint.route("/<path:blog_path>") +def display_blog(blog_path): + return render_template("blogs.html", rendered_markdown=render_markdown(blog_path)) + + +@blogs_blueprint.route("/") +def blogs_list(): + blogs = get_blogs() + + return render_template("blogs_list.html", blogs=blogs) diff --git a/gn2/wqflask/api/router.py b/gn2/wqflask/api/router.py new file mode 100644 index 00000000..bcd08e8d --- /dev/null +++ 
b/gn2/wqflask/api/router.py @@ -0,0 +1,1037 @@ +# GN2 API + +import os +import io +import csv +import json +import datetime +import requests + +from zipfile import ZipFile, ZIP_DEFLATED + + +import flask +from flask import current_app +from gn2.wqflask.database import database_connection +from flask import request +from flask import make_response +from flask import send_file + +from gn2.wqflask import app + +from gn2.wqflask.api import correlation, mapping, gen_menu + +from gn2.utility.tools import flat_files, get_setting + +from gn2.wqflask.database import database_connection + + +version = "pre1" + + +@app.route("/api/v_{}/".format(version)) +def hello_world(): + return flask.jsonify({"hello": "world"}) + + +@app.route("/api/v_{}/species".format(version)) +def get_species_list(): + species_list = [] + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT SpeciesId, Name, FullName, TaxonomyId FROM Species" + ) + for species in cursor.fetchall(): + species_dict = { + "Id": species[0], + "Name": species[1], + "FullName": species[2], + "TaxonomyId": species[3] + } + species_list.append(species_dict) + return flask.jsonify(species_list) + + +@app.route("/api/v_{}/species/<path:species_name>".format(version)) +@app.route("/api/v_{}/species/<path:species_name>.<path:file_format>".format(version)) +def get_species_info(species_name, file_format="json"): + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT SpeciesId, Name, FullName, TaxonomyId " + "FROM Species WHERE (Name=%s OR FullName=%s " + "OR SpeciesName=%s)", ((species_name,)*3)) + _species = cursor.fetchone() + species_dict = { + "Id": _species[0], + "Name": _species[1], + "FullName": _species[2], + "TaxonomyId": _species[3] + } + + return flask.jsonify(species_dict) + + +@app.route("/api/v_{}/groups".format(version)) +@app.route("/api/v_{}/groups/<path:species_name>".format(version)) +def get_groups_list(species_name=None): + _groups = () + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + if species_name: + cursor.execute( + "SELECT InbredSet.InbredSetId, " + "InbredSet.SpeciesId, InbredSet.InbredSetName, " + "InbredSet.Name, InbredSet.FullName, " + "InbredSet.public, IFNULL(InbredSet.MappingMethodId, " + "'None'), IFNULL(InbredSet.GeneticType, 'None') " + "FROM InbredSet, Species WHERE " + "InbredSet.SpeciesId = Species.Id AND " + "(Species.Name = %s OR Species.FullName=%s " + "OR Species.SpeciesName=%s)", ((species_name,) * 3) + ) + else: + cursor.execute( + "SELECT InbredSet.InbredSetId, " + "InbredSet.SpeciesId, InbredSet.InbredSetName, " + "InbredSet.Name, InbredSet.FullName, " + "InbredSet.public, IFNULL(InbredSet.MappingMethodId, " + "'None'), IFNULL(InbredSet.GeneticType, 'None') " + "FROM InbredSet" + ) + _groups = cursor.fetchall() + + if _groups: + groups_list = [] + for group in _groups: + group_dict = { + "Id": group[0], + "SpeciesId": group[1], + "DisplayName": group[2], + "Name": group[3], + "FullName": group[4], + "public": group[5], + "MappingMethodId": group[6], + "GeneticType": group[7] + } + groups_list.append(group_dict) + return flask.jsonify(groups_list) + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + + +@app.route("/api/v_{}/group/<path:group_name>".format(version)) +@app.route("/api/v_{}/group/<path:group_name>.<path:file_format>".format(version)) 
+@app.route("/api/v_{}/group/<path:species_name>/<path:group_name>".format(version)) +@app.route("/api/v_{}/group/<path:species_name>/<path:group_name>.<path:file_format>".format(version)) +def get_group_info(group_name, species_name=None, file_format="json"): + group = tuple() + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + if species_name: + cursor.execute( + "SELECT InbredSet.InbredSetId, InbredSet.SpeciesId, " + "InbredSet.InbredSetName, InbredSet.Name, " + "InbredSet.FullName, InbredSet.public, " + "IFNULL(InbredSet.MappingMethodId, 'None'), " + "IFNULL(InbredSet.GeneticType, 'None') " + "FROM InbredSet, Species WHERE " + "InbredSet.SpeciesId = Species.Id " + "AND (InbredSet.InbredSetName = %s OR " + "InbredSet.Name = %s OR InbredSet.FullName = %s) " + "AND (Species.Name = %s OR " + "Species.FullName = %s OR Species.SpeciesName = %s)", + (*(group_name,)*3, *(species_name,)*3) + ) + else: + cursor.execute( + "SELECT InbredSet.InbredSetId, InbredSet.SpeciesId, " + "InbredSet.InbredSetName, InbredSet.Name, " + "InbredSet.FullName, InbredSet.public, " + "IFNULL(InbredSet.MappingMethodId, 'None'), " + "IFNULL(InbredSet.GeneticType, 'None') " + "FROM InbredSet WHERE " + "(InbredSet.InbredSetName = %s OR " + "InbredSet.Name = %s OR " + "InbredSet.FullName = %s)", + ((group_name,)*3) + ) + group = cursor.fetchone() + + if group: + group_dict = { + "Id": group[0], + "SpeciesId": group[1], + "DisplayName": group[2], + "Name": group[3], + "FullName": group[4], + "public": group[5], + "MappingMethodId": group[6], + "GeneticType": group[7] + } + + return flask.jsonify(group_dict) + else: + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + + +@app.route("/api/v_{}/datasets/<path:group_name>".format(version)) +@app.route("/api/v_{}/datasets/<path:species_name>/<path:group_name>".format(version)) +def get_datasets_for_group(group_name, species_name=None): + _datasets = () + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + if species_name: + cursor.execute( + "SELECT ProbeSetFreeze.Id, ProbeSetFreeze.ProbeFreezeId, " + "ProbeSetFreeze.AvgID, ProbeSetFreeze.Name, " + "ProbeSetFreeze.Name2, " + "ProbeSetFreeze.FullName, ProbeSetFreeze.ShortName, " + "ProbeSetFreeze.CreateTime, ProbeSetFreeze.public, " + "ProbeSetFreeze.confidentiality, ProbeSetFreeze.DataScale " + "FROM ProbeSetFreeze, ProbeFreeze, InbredSet, Species " + "WHERE ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id " + "AND ProbeFreeze.InbredSetId = InbredSet.Id " + "AND (InbredSet.Name = %s OR " + "InbredSet.InbredSetName = %s OR " + "InbredSet.FullName = %s) AND " + "InbredSet.SpeciesId = Species.Id AND " + "(Species.SpeciesName = %s OR " + "Species.MenuName = %s OR Species.FullName = %s);", + (*(group_name,)*3, *(species_name)*3) + ) + else: + cursor.execute( + "SELECT ProbeSetFreeze.Id, ProbeSetFreeze.ProbeFreezeId, " + "ProbeSetFreeze.AvgID, ProbeSetFreeze.Name, " + "ProbeSetFreeze.Name2, ProbeSetFreeze.FullName, " + "ProbeSetFreeze.ShortName, ProbeSetFreeze.CreateTime, " + "ProbeSetFreeze.public, ProbeSetFreeze.confidentiality, " + "ProbeSetFreeze.DataScale FROM ProbeSetFreeze, " + "ProbeFreeze, InbredSet WHERE " + "ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id " + "AND ProbeFreeze.InbredSetId = InbredSet.Id " + "AND (InbredSet.Name = %s OR " + "InbredSet.InbredSetName = %s OR " + "InbredSet.FullName = %s)", + ((group_name,) * 3) + ) + _datasets = cursor.fetchall() + + if _datasets: + datasets_list = [] + for dataset in 
_datasets: + dataset_dict = { + "Id": dataset[0], + "ProbeFreezeId": dataset[1], + "AvgID": dataset[2], + "Short_Abbreviation": dataset[3], + "Long_Abbreviation": dataset[4], + "FullName": dataset[5], + "ShortName": dataset[6], + "CreateTime": dataset[7], + "public": dataset[8], + "confidentiality": dataset[9], + "DataScale": dataset[10] + } + datasets_list.append(dataset_dict) + + return flask.jsonify(datasets_list) + else: + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + + +@app.route("/api/v_{}/dataset/<path:dataset_name>".format(version)) +@app.route("/api/v_{}/dataset/<path:dataset_name>.<path:file_format>".format(version)) +@app.route("/api/v_{}/dataset/<path:group_name>/<path:dataset_name>".format(version)) +@app.route("/api/v_{}/dataset/<path:group_name>/<path:dataset_name>.<path:file_format>".format(version)) +def get_dataset_info(dataset_name, group_name=None, file_format="json"): + # ZS: First get ProbeSet (mRNA expression) datasets and then get Phenotype datasets + + # ZS: I figure I might as well return a list if there are multiple + # matches, though I don"t know if this will actually happen in + # practice + datasets_list, dataset_dict = [], {} + probeset_query = """ + SELECT ProbeSetFreeze.Id, ProbeSetFreeze.Name, ProbeSetFreeze.FullName, + ProbeSetFreeze.ShortName, ProbeSetFreeze.DataScale, ProbeFreeze.TissueId, + Tissue.Name, ProbeSetFreeze.public, ProbeSetFreeze.confidentiality + FROM ProbeSetFreeze, ProbeFreeze, Tissue + """ + + where_statement = """ + WHERE ProbeSetFreeze.ProbeFreezeId = ProbeFreeze.Id AND + ProbeFreeze.TissueId = Tissue.Id AND + ProbeSetFreeze.public > 0 AND + ProbeSetFreeze.confidentiality < 1 AND + """ + if dataset_name.isdigit(): + where_statement += """ + ProbeSetFreeze.Id = "{}" + """.format(dataset_name) + else: + where_statement += """ + (ProbeSetFreeze.Name = "{0}" OR ProbeSetFreeze.Name2 = "{0}" OR + ProbeSetFreeze.FullName = "{0}" OR ProbeSetFreeze.ShortName = "{0}") + """.format(dataset_name) + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute(f"{probeset_query}{where_statement}") + + if dataset := cursor.fetchone(): + dataset_dict = { + "dataset_type": "mRNA expression", + "id": dataset[0], + "name": dataset[1], + "full_name": dataset[2], + "short_name": dataset[3], + "data_scale": dataset[4], + "tissue_id": dataset[5], + "tissue": dataset[6], + "public": dataset[7], + "confidential": dataset[8] + } + + datasets_list.append(dataset_dict) + + if group_name: + cursor.execute( + "SELECT PublishXRef.Id, " + "Phenotype.Post_publication_abbreviation, " + "Phenotype.Post_publication_description, " + "Phenotype.Pre_publication_abbreviation, " + "Phenotype.Pre_publication_description, " + "Publication.PubMed_ID, Publication.Title, " + "Publication.Year FROM PublishXRef, Phenotype, " + "Publication, InbredSet, PublishFreeze WHERE " + "PublishXRef.InbredSetId = InbredSet.Id " + "AND PublishXRef.PhenotypeId = Phenotype.Id " + "AND PublishXRef.PublicationId = Publication.Id " + "AND PublishFreeze.InbredSetId = InbredSet.Id " + "AND PublishFreeze.public > 0 AND " + "PublishFreeze.confidentiality < 1 " + "AND InbredSet.Name = %s AND PublishXRef.Id = %s", + (group_name, dataset_name,) + ) + + if dataset := cursor.fetchone(): + if dataset[5]: + dataset_dict = { + "dataset_type": "phenotype", + "id": dataset[0], + "name": dataset[1], + "description": dataset[2], + "pubmed_id": dataset[5], + "title": dataset[6], + "year": dataset[7] + } + elif dataset[4]: + 
dataset_dict = { + "dataset_type": "phenotype", + "id": dataset[0], + "name": dataset[3], + "description": dataset[4] + } + else: + dataset_dict = { + "dataset_type": "phenotype", + "id": dataset[0] + } + + datasets_list.append(dataset_dict) + + if len(datasets_list) > 1: + return flask.jsonify(datasets_list) + elif len(datasets_list) == 1: + return flask.jsonify(dataset_dict) + else: + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + + +@app.route("/api/v_{}/traits/<path:dataset_name>".format(version), methods=("GET",)) +@app.route("/api/v_{}/traits/<path:dataset_name>.<path:file_format>".format(version), methods=("GET",)) +def fetch_traits(dataset_name, file_format="json"): + trait_ids, trait_names, data_type, dataset_id = get_dataset_trait_ids( + dataset_name, request.args) + if ("ids_only" in request.args) and (len(trait_ids) > 0): + if file_format == "json": + filename = dataset_name + "_trait_ids.json" + return flask.jsonify(trait_ids) + else: + filename = dataset_name + "_trait_ids.csv" + + si = io.StringIO() + csv_writer = csv.writer(si) + csv_writer.writerows([[trait_id] for trait_id in trait_ids]) + output = make_response(si.getvalue()) + output.headers["Content-Disposition"] = "attachment; filename=" + filename + output.headers["Content-type"] = "text/csv" + return output + elif ("names_only" in request.args) and (len(trait_ids) > 0): + if file_format == "json": + filename = dataset_name + "_trait_names.json" + return flask.jsonify(trait_names) + else: + filename = dataset_name + "_trait_names.csv" + + si = io.StringIO() + csv_writer = csv.writer(si) + csv_writer.writerows([[trait_name] for trait_name in trait_names]) + output = make_response(si.getvalue()) + output.headers["Content-Disposition"] = "attachment; filename=" + filename + output.headers["Content-type"] = "text/csv" + return output + else: + if len(trait_ids) > 0: + if data_type == "ProbeSet": + query = """ + SELECT + ProbeSet.Id, ProbeSet.Name, ProbeSet.Symbol, ProbeSet.description, ProbeSet.Chr, ProbeSet.Mb, ProbeSet.alias, + ProbeSetXRef.mean, ProbeSetXRef.se, ProbeSetXRef.Locus, ProbeSetXRef.LRS, ProbeSetXRef.pValue, ProbeSetXRef.additive, ProbeSetXRef.h2 + FROM + ProbeSet, ProbeSetXRef, ProbeSetFreeze + WHERE + ProbeSetXRef.ProbeSetFreezeId = "{0}" AND + ProbeSetXRef.ProbeSetId = ProbeSet.Id AND + ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND + ProbeSetFreeze.public > 0 AND + ProbeSetFreeze.confidentiality < 1 + ORDER BY + ProbeSet.Id + """ + + field_list = ["Id", "Name", "Symbol", "Description", "Chr", "Mb", + "Aliases", "Mean", "SE", "Locus", "LRS", "P-Value", "Additive", "h2"] + elif data_type == "Geno": + query = """ + SELECT + Geno.Id, Geno.Name, Geno.Marker_Name, Geno.Chr, Geno.Mb, Geno.Sequence, Geno.Source + FROM + Geno, GenoXRef, GenoFreeze + WHERE + GenoXRef.GenoFreezeId = "{0}" AND + GenoXRef.GenoId = Geno.Id AND + GenoXRef.GenoFreezeId = GenoFreeze.Id AND + GenoFreeze.public > 0 AND + GenoFreeze.confidentiality < 1 + ORDER BY + Geno.Id + """ + + field_list = ["Id", "Name", "Marker_Name", + "Chr", "Mb", "Sequence", "Source"] + else: + query = """SELECT PublishXRef.Id, + Phenotype.`Original_description`, + Publication.`Authors`, + Publication.`Year`, + Publication.`PubMed_ID`, + PublishXRef.`mean`, + PublishXRef.`LRS`, + PublishXRef.`additive`, + PublishXRef.`Locus`, + Geno.`Chr`, Geno.`Mb` + FROM Species + INNER JOIN InbredSet ON InbredSet.`SpeciesId` = Species.`Id` + INNER JOIN PublishXRef ON PublishXRef.`InbredSetId` = InbredSet.`Id` + INNER JOIN 
PublishFreeze ON PublishFreeze.`InbredSetId` = InbredSet.`Id` + INNER JOIN Publication ON Publication.`Id` = PublishXRef.`PublicationId` + INNER JOIN Phenotype ON Phenotype.`Id` = PublishXRef.`PhenotypeId` + LEFT JOIN Geno ON PublishXRef.Locus = Geno.Name AND Geno.SpeciesId = Species.Id + WHERE + PublishXRef.InbredSetId = {0} AND + PublishFreeze.InbredSetId = PublishXRef.InbredSetId AND + PublishFreeze.public > 0 AND + PublishFreeze.confidentiality < 1 + ORDER BY + PublishXRef.Id""" + + field_list = ["Id", "Description", "Authors", "Year", "PubMedID", "Mean", + "LRS", "Additive", "Locus", "Chr", "Mb"] + + if 'limit_to' in request.args: + limit_number = request.args['limit_to'] + query += "LIMIT " + str(limit_number) + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + if file_format == "json": + filename = dataset_name + "_traits.json" + cursor.execute(query.format(dataset_id)) + result_list = [] + for result in cursor.fetchall(): + trait_dict = {} + for i, field in enumerate(field_list): + if result[i]: + trait_dict[field] = result[i] + result_list.append(trait_dict) + return flask.jsonify(result_list) + elif file_format == "csv": + filename = dataset_name + "_traits.csv" + + results_list = [] + header_list = [] + header_list += field_list + results_list.append(header_list) + cursor.execute(query.format(dataset_id)) + for result in cursor.fetchall(): + results_list.append(result) + + si = io.StringIO() + csv_writer = csv.writer(si) + csv_writer.writerows(results_list) + output = make_response(si.getvalue()) + output.headers["Content-Disposition"] = "attachment; filename=" + filename + output.headers["Content-type"] = "text/csv" + return output + else: + return return_error( + code=400, + source=request.url_rule.rule, + title="Invalid Output Format", + details="Current formats available are JSON and CSV, with CSV as default" + ) + else: + return return_error( + code=204, + source=request.url_rule.rule, + title="No Results", + details="") + + +@app.route("/api/v_{}/sample_data/<path:dataset_name>".format(version)) +@app.route("/api/v_{}/sample_data/<path:dataset_name>.<path:file_format>".format(version)) +def all_sample_data(dataset_name, file_format="csv"): + trait_ids, trait_names, data_type, dataset_id = get_dataset_trait_ids( + dataset_name, request.args) + + if len(trait_ids) > 0: + sample_list = get_samplelist(dataset_name) + + if data_type == "ProbeSet": + query = """ + SELECT + Strain.Name, Strain.Name2, ProbeSetData.value, ProbeSetData.Id, ProbeSetSE.error + FROM + (ProbeSetData, Strain, ProbeSetXRef, ProbeSetFreeze) + LEFT JOIN ProbeSetSE ON + (ProbeSetSE.DataId = ProbeSetData.Id AND ProbeSetSE.StrainId = ProbeSetData.StrainId) + WHERE + ProbeSetXRef.ProbeSetFreezeId = "{0}" AND + ProbeSetXRef.ProbeSetId = "{1}" AND + ProbeSetXRef.DataId = ProbeSetData.Id AND + ProbeSetData.StrainId = Strain.Id AND + ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND + ProbeSetFreeze.public > 0 AND + ProbeSetFreeze.confidentiality < 1 + ORDER BY + Strain.Name + """ + elif data_type == "Geno": + query = """ + SELECT + Strain.Name, Strain.Name2, GenoData.value, GenoData.Id, GenoSE.error + FROM + (GenoData, Strain, GenoXRef, GenoFreeze) + LEFT JOIN GenoSE ON + (GenoSE.DataId = GenoData.Id AND GenoSE.StrainId = GenoData.StrainId) + WHERE + GenoXRef.GenoFreezeId = "{0}" AND + GenoXRef.GenoId = "{1}" AND + GenoXRef.DataId = GenoData.Id AND + GenoData.StrainId = Strain.Id AND + GenoXRef.GenoFreezeId = GenoFreeze.Id AND + GenoFreeze.public > 0 AND + 
GenoFreeze.confidentiality < 1 + ORDER BY + Strain.Name + """ + else: + query = """ + SELECT + Strain.Name, Strain.Name2, PublishData.value, PublishData.Id, PublishSE.error, NStrain.count + FROM + (PublishData, Strain, PublishXRef, PublishFreeze) + LEFT JOIN PublishSE ON + (PublishSE.DataId = PublishData.Id AND PublishSE.StrainId = PublishData.StrainId) + LEFT JOIN NStrain ON + (NStrain.DataId = PublishData.Id AND + NStrain.StrainId = PublishData.StrainId) + WHERE + PublishXRef.InbredSetId = "{0}" AND + PublishXRef.PhenotypeId = "{1}" AND + PublishData.Id = PublishXRef.DataId AND + PublishData.StrainId = Strain.Id AND + PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND + PublishFreeze.public > 0 AND + PublishFreeze.confidentiality < 1 + ORDER BY + Strain.Name + """ + + if file_format == "csv": + filename = dataset_name + "_sample_data.csv" + + results_list = [] + header_list = [] + header_list.append("id") + header_list += sample_list + results_list.append(header_list) + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + for i, trait_id in enumerate(trait_ids): + line_list = [] + line_list.append(str(trait_names[i])) + cursor.execute(query.format(dataset_id, trait_id)) + results = cursor.fetchall() + results_dict = {} + for item in results: + results_dict[item[0]] = item[2] + for sample in sample_list: + if sample in results_dict: + line_list.append(results_dict[sample]) + else: + line_list.append("x") + results_list.append(line_list) + + results_list = list(map(list, zip(*results_list))) + + si = io.StringIO() + csv_writer = csv.writer(si) + csv_writer.writerows(results_list) + output = make_response(si.getvalue()) + output.headers["Content-Disposition"] = "attachment; filename=" + filename + output.headers["Content-type"] = "text/csv" + return output + else: + return return_error(code=415, source=request.url_rule.rule, title="Unsupported file format", details="") + else: + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + + +@app.route("/api/v_{}/sample_data/<path:dataset_name>/<path:trait_name>".format(version)) +@app.route("/api/v_{}/sample_data/<path:dataset_name>/<path:trait_name>.<path:file_format>".format(version)) +def trait_sample_data(dataset_name, trait_name, file_format="json"): + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT Strain.Name, Strain.Name2, " + "ProbeSetData.value, ProbeSetData.Id, " + "ProbeSetSE.error FROM (ProbeSetData, " + "ProbeSetFreeze, Strain, ProbeSet, " + "ProbeSetXRef) LEFT JOIN ProbeSetSE ON " + "(ProbeSetSE.DataId = ProbeSetData.Id AND " + "ProbeSetSE.StrainId = ProbeSetData.StrainId) " + "WHERE ProbeSet.Name = %s AND " + "ProbeSetXRef.ProbeSetId = ProbeSet.Id " + "AND ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetFreeze.Name = %s AND " + "ProbeSetXRef.DataId = ProbeSetData.Id " + "AND ProbeSetData.StrainId = Strain.Id " + "ORDER BY Strain.Name", + (trait_name, dataset_name,) + ) + + sample_data = cursor.fetchall() + if len(sample_data) > 0: + sample_list = [] + for sample in sample_data: + sample_dict = { + "sample_name": sample[0], + "sample_name_2": sample[1], + "value": sample[2], + "data_id": sample[3], + } + if sample[4]: + sample_dict["se"] = sample[4] + sample_list.append(sample_dict) + + return flask.jsonify(sample_list) + else: + if not dataset_name.isdigit(): + group_id = get_group_id(dataset_name) + if group_id: + dataset_or_group = group_id + else: + dataset_or_group = 
dataset_name + else: + dataset_or_group = dataset_name + + cursor.execute( + "SELECT DISTINCT Strain.Name, Strain.Name2, " + "PublishData.value, PublishData.Id, PublishSE.error, " + "NStrain.count FROM (PublishData, Strain, " + "PublishXRef, PublishFreeze) LEFT JOIN " + "PublishSE ON (PublishSE.DataId = PublishData.Id " + "AND PublishSE.StrainId = PublishData.StrainId) " + "LEFT JOIN NStrain ON " + "(NStrain.DataId = PublishData.Id AND " + "NStrain.StrainId = PublishData.StrainId) " + "WHERE PublishXRef.InbredSetId = PublishFreeze.InbredSetId " + "AND PublishData.Id = PublishXRef.DataId AND " + "PublishXRef.Id = %s AND (PublishFreeze.Id = %s " + "OR PublishFreeze.Name = %s OR " + "PublishFreeze.ShortName = %s OR " + "PublishXRef.InbredSetId = %s) AND " + "PublishData.StrainId = Strain.Id " + "ORDER BY Strain.Name", + (trait_name, *(dataset_or_group,)*4) + ) + if len(sample_data := cursor.fetchall()) > 0: + sample_list = [] + for sample in sample_data: + sample_dict = { + "sample_name": sample[0], + "sample_name_2": sample[1], + "value": sample[2], + "data_id": sample[3] + } + if sample[4]: + sample_dict["se"] = sample[4] + if sample[5]: + sample_dict["n_cases"] = sample[5] + sample_list.append(sample_dict) + + return flask.jsonify(sample_list) + else: + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + + +@app.route("/api/v_{}/trait/<path:dataset_name>/<path:trait_name>".format(version)) +@app.route("/api/v_{}/trait/<path:dataset_name>/<path:trait_name>.<path:file_format>".format(version)) +@app.route("/api/v_{}/trait_info/<path:dataset_name>/<path:trait_name>".format(version)) +@app.route("/api/v_{}/trait_info/<path:dataset_name>/<path:trait_name>.<path:file_format>".format(version)) +def get_trait_info(dataset_name, trait_name, file_format="json"): + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT ProbeSet.Id, ProbeSet.Name, ProbeSet.Symbol, " + "ProbeSet.description, ProbeSet.Chr, ProbeSet.Mb, " + "ProbeSet.alias, ProbeSetXRef.mean, ProbeSetXRef.se, " + "ProbeSetXRef.Locus, ProbeSetXRef.LRS, " + "ProbeSetXRef.pValue, ProbeSetXRef.additive " + "FROM ProbeSet, ProbeSetXRef, ProbeSetFreeze " + "WHERE ProbeSet.Name = %s AND " + "ProbeSetXRef.ProbeSetId = ProbeSet.Id AND " + "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + "AND ProbeSetFreeze.Name = %s", + (trait_name, dataset_name,) + ) + if trait_info := cursor.fetchone(): + trait_dict = { + "id": trait_info[0], + "name": trait_info[1], + "symbol": trait_info[2], + "description": trait_info[3], + "chr": trait_info[4], + "mb": trait_info[5], + "alias": trait_info[6], + "mean": trait_info[7], + "se": trait_info[8], + "locus": trait_info[9], + "lrs": trait_info[10], + "p_value": trait_info[11], + "additive": trait_info[12] + } + + return flask.jsonify(trait_dict) + else: + # ZS: Check if the user input the dataset_name as BXDPublish, etc (which is always going to be the group name + "Publish" + if "Publish" in dataset_name: + dataset_name = dataset_name.replace("Publish", "") + + group_id = get_group_id(dataset_name) + cursor.execute( + "SELECT PublishXRef.PhenotypeId, " + "PublishXRef.Locus, PublishXRef.LRS, " + "PublishXRef.additive FROM " + "PublishXRef WHERE " + "PublishXRef.Id = %s AND " + "PublishXRef.InbredSetId = %s", + (trait_name, group_id,) + ) + if trait_info := cursor.fetchone(): + trait_dict = { + "id": trait_info[0], + "locus": trait_info[1], + "lrs": trait_info[2], + "additive": trait_info[3] + } + + return 
flask.jsonify(trait_dict) + else: + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + + +@app.route("/api/v_{}/correlation".format(version), methods=("GET",)) +def get_corr_results(): + results = correlation.do_correlation(request.args) + + if len(results) > 0: + # ZS: I think flask.jsonify expects a dict/list instead of JSON + return flask.jsonify(results) + else: + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + + +@app.route("/api/v_{}/mapping".format(version), methods=("GET",)) +def get_mapping_results(): + results, format = mapping.do_mapping_for_api(request.args) + + if len(results) > 0: + if format == "csv": + filename = "mapping_" + datetime.datetime.utcnow().strftime("%b_%d_%Y_%I:%M%p") + ".csv" + + si = io.StringIO() + csv_writer = csv.writer(si) + csv_writer.writerows(results) + output = make_response(si.getvalue()) + output.headers["Content-Disposition"] = "attachment; filename=" + filename + output.headers["Content-type"] = "text/csv" + + return output + elif format == "json": + return flask.jsonify(results) + else: + return return_error(code=415, source=request.url_rule.rule, title="Unsupported Format", details="") + else: + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + + +@app.route("/api/v_{}/genotypes/view/<string:group_name>".format(version)) +def view_genotype_files(group_name): + if os.path.isfile("{0}/{1}.json".format(flat_files("genotype"), group_name)): + with open("{0}/{1}.json".format(flat_files("genotype"), group_name)) as geno_json: + return flask.jsonify(json.load(geno_json)) + + +@app.route("/api/v_{}/genotypes/<string:file_format>/<string:group_name>/<string:dataset_name>.zip".format(version)) +@app.route("/api/v_{}/genotypes/<string:file_format>/<string:group_name>/<string:dataset_name>".format(version)) +@app.route("/api/v_{}/genotypes/<string:file_format>/<string:group_name>.zip".format(version)) +@app.route("/api/v_{}/genotypes/<string:file_format>/<string:group_name>".format(version)) +@app.route("/api/v_{}/genotypes/<string:group_name>.<string:file_format>".format(version)) +def get_genotypes(group_name, file_format="csv", dataset_name=None): + limit_num = None + if 'limit_to' in request.args: + if request.args['limit_to'].isdigit(): + limit_num = int(request.args['limit_to']) + + si = io.StringIO() + if file_format == "csv" or file_format == "geno": + filename = group_name + ".geno" + + if os.path.isfile("{0}/{1}.geno".format(flat_files("genotype"), group_name)): + output_lines = [] + with open("{0}/{1}.geno".format(flat_files("genotype"), group_name)) as genofile: + i = 0 + for line in genofile: + if line[0] == "#" or line[0] == "@": + output_lines.append([line.strip()]) + else: + if limit_num and i >= limit_num: + break + output_lines.append(line.split()) + i += 1 + + csv_writer = csv.writer( + si, delimiter="\t", escapechar="\\", quoting=csv.QUOTE_NONE) + else: + return return_error(code=204, source=request.url_rule.rule, title="No Results", details="") + elif file_format == "rqtl2": + memory_file = io.BytesIO() + if dataset_name: + filename = dataset_name + else: + filename = group_name + + if os.path.isfile("{0}/{1}_geno.csv".format(flat_files("genotype/rqtl2"), group_name)): + yaml_file = json.load( + open("{0}/{1}.json".format(flat_files("genotype/rqtl2"), group_name))) + yaml_file["geno"] = filename + "_geno.csv" + yaml_file["gmap"] = filename + "_gmap.csv" + yaml_file["pheno"] = filename + "_pheno.csv" + 
+
+
+@app.route("/api/v_{}/genotypes/<string:file_format>/<string:group_name>/<string:dataset_name>.zip".format(version))
+@app.route("/api/v_{}/genotypes/<string:file_format>/<string:group_name>/<string:dataset_name>".format(version))
+@app.route("/api/v_{}/genotypes/<string:file_format>/<string:group_name>.zip".format(version))
+@app.route("/api/v_{}/genotypes/<string:file_format>/<string:group_name>".format(version))
+@app.route("/api/v_{}/genotypes/<string:group_name>.<string:file_format>".format(version))
+def get_genotypes(group_name, file_format="csv", dataset_name=None):
+    limit_num = None
+    if 'limit_to' in request.args and request.args['limit_to'].isdigit():
+        limit_num = int(request.args['limit_to'])
+
+    si = io.StringIO()
+    if file_format == "csv" or file_format == "geno":
+        filename = group_name + ".geno"
+
+        if os.path.isfile("{0}/{1}.geno".format(flat_files("genotype"), group_name)):
+            output_lines = []
+            with open("{0}/{1}.geno".format(flat_files("genotype"), group_name)) as genofile:
+                i = 0
+                for line in genofile:
+                    if line[0] == "#" or line[0] == "@":
+                        output_lines.append([line.strip()])
+                    else:
+                        if limit_num and i >= limit_num:
+                            break
+                        output_lines.append(line.split())
+                        i += 1
+
+            csv_writer = csv.writer(
+                si, delimiter="\t", escapechar="\\", quoting=csv.QUOTE_NONE)
+        else:
+            return return_error(code=204, source=request.url_rule.rule, title="No Results", details="")
+    elif file_format == "rqtl2":
+        memory_file = io.BytesIO()
+        if dataset_name:
+            filename = dataset_name
+        else:
+            filename = group_name
+
+        if os.path.isfile("{0}/{1}_geno.csv".format(flat_files("genotype/rqtl2"), group_name)):
+            # The R/qtl2 control file is JSON; load it with a context manager
+            # so the handle is closed (it was previously named yaml_file and
+            # opened without being closed)
+            with open("{0}/{1}.json".format(flat_files("genotype/rqtl2"), group_name)) as control_fh:
+                control_file = json.load(control_fh)
+            control_file["geno"] = filename + "_geno.csv"
+            control_file["gmap"] = filename + "_gmap.csv"
+            control_file["pheno"] = filename + "_pheno.csv"
+            config_file = [filename + ".json", json.dumps(control_file)]
+
+            # Fetch the phenotypes via the public sample_data endpoint
+            if dataset_name:
+                phenotypes = requests.get(
+                    "http://gn2.genenetwork.org/api/v_pre1/sample_data/" + dataset_name)
+            else:
+                phenotypes = requests.get(
+                    "http://gn2.genenetwork.org/api/v_pre1/sample_data/" + group_name + "Publish")
+
+            with ZipFile(memory_file, 'w', compression=ZIP_DEFLATED) as zf:
+                zf.writestr(config_file[0], config_file[1])
+                # Read the geno and gmap files inside context managers so the
+                # handles are closed after use
+                for suffix in ["_geno.csv", "_gmap.csv"]:
+                    with open("{0}/{1}{2}".format(flat_files("genotype/rqtl2"), group_name, suffix)) as data_fh:
+                        zf.writestr(filename + suffix, data_fh.read())
+                zf.writestr(filename + "_pheno.csv", phenotypes.content)
+
+            memory_file.seek(0)
+
+            return send_file(memory_file, attachment_filename=filename + ".zip", as_attachment=True)
+        else:
+            return return_error(code=204, source=request.url_rule.rule, title="No Results", details="")
+    else:
+        filename = group_name + ".bimbam"
+
+        # Check for the BIMBAM file that is actually opened below, not the .geno file
+        if os.path.isfile("{0}/{1}_geno.txt".format(flat_files("genotype/bimbam"), group_name)):
+            output_lines = []
+            with open("{0}/{1}_geno.txt".format(flat_files("genotype/bimbam"), group_name)) as genofile:
+                i = 0
+                for line in genofile:
+                    if limit_num and i >= limit_num:
+                        break
+                    # "field" avoids shadowing the loop variable "line"
+                    output_lines.append([field.strip() for field in line.split(",")])
+                    i += 1
+
+            csv_writer = csv.writer(si, delimiter=",")
+        else:
+            return return_error(code=204, source=request.url_rule.rule, title="No Results", details="")
+
+    csv_writer.writerows(output_lines)
+    output = make_response(si.getvalue())
+    output.headers["Content-Disposition"] = "attachment; filename=" + filename
+    output.headers["Content-type"] = "text/csv"
+
+    return output
+
+
+@app.route("/api/v_{}/gen_dropdown".format(version), methods=("GET",))
+def gen_dropdown_menu():
+    with database_connection(get_setting("SQL_URI")) as conn:
+        results = gen_menu.gen_dropdown_json(conn)
+
+    if len(results) > 0:
+        return flask.jsonify(results)
+    else:
+        return return_error(code=500, source=request.url_rule.rule, title="Some error occurred", details="")
+
+
+def return_error(code, source, title, details):
+    json_ob = {"errors": [
+        {
+            "status": code,
+            "source": {"pointer": source},
+            "title": title,
+            "detail": details
+        }
+    ]}
+
+    # Propagate the intended status code instead of always replying with HTTP 200
+    return flask.jsonify(json_ob), code
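Since every failure path above funnels through return_error, clients can rely on the JSON:API-style error shape it builds. A small consumption sketch (the identifiers are made up; not part of the patch):

import requests

resp = requests.get("http://gn2.genenetwork.org/api/v_pre1/trait/NoSuchSet/NoSuchTrait")  # made-up ids
for err in resp.json().get("errors", []):
    # Each entry carries {status, source: {pointer}, title, detail}
    print(err["status"], err["title"], err["source"]["pointer"])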
"PublishXRef.Id, InbredSet.InbredSetCode " + "FROM PublishXRef, InbredSet WHERE " + "PublishXRef.InbredSetId = %s AND " + "InbredSet.Id = PublishXRef.InbredSetId " + f"{limit_string}", + (dataset_id,) + ) + results = cursor.fetchall() + + trait_ids = [result[0] for result in results] + trait_names = [str(result[2]) + "_" + str(result[1]) + for result in results] + + return trait_ids, trait_names, data_type, dataset_id + + else: + data_type = "ProbeSet" + cursor.execute( + "SELECT ProbeSetXRef.ProbeSetId, " + "ProbeSet.Name, ProbeSetXRef.ProbeSetFreezeId " + "FROM ProbeSet, ProbeSetXRef, " + "ProbeSetFreeze WHERE " + "ProbeSet.Id = ProbeSetXRef.ProbeSetId AND " + "ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id " + f"AND ProbeSetFreeze.Name = %s {limit_string}", + (dataset_name,) + ) + results = cursor.fetchall() + trait_ids = [result[0] for result in results] + trait_names = [result[1] for result in results] + dataset_id = results[0][2] + return trait_ids, trait_names, data_type, dataset_id + + +def get_samplelist(dataset_name): + group_id = get_group_id_from_dataset(dataset_name) + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT Strain.Name FROM Strain, StrainXRef " + "WHERE StrainXRef.StrainId = Strain.Id AND " + "StrainXRef.InbredSetId = %s", + (group_id,) + ) + # sample list + return [result[0] for result in cursor.fetchall()] + + +def get_group_id_from_dataset(dataset_name): + result = () + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + if "Publish" in dataset_name: + cursor.execute( + "SELECT InbredSet.Id FROM " + "InbredSet, PublishFreeze " + "WHERE PublishFreeze.InbredSetId = InbredSet.Id " + "AND PublishFreeze.Name = %s", + (dataset_name,) + ) + elif "Geno" in dataset_name: + cursor.execute( + "SELECT InbredSet.Id FROM " + "InbredSet, GenoFreeze WHERE " + "GenoFreeze.InbredSetId = InbredSet.Id " + "AND GenoFreeze.Name = %s", + (dataset_name,) + ) + else: + cursor.execute( + "SELECT InbredSet.Id FROM " + "InbredSet, ProbeSetFreeze, " + "ProbeFreeze WHERE " + "ProbeFreeze.InbredSetId = InbredSet.Id " + "AND ProbeFreeze.Id = ProbeSetFreeze.ProbeFreezeId " + "AND ProbeSetFreeze.Name = %s", + (dataset_name,) + ) + if result := cursor.fetchone(): + return result[0] + return None + + +def get_group_id(group_name): + with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor: + cursor.execute( + "SELECT InbredSet.Id FROM InbredSet " + "WHERE InbredSet.Name = %s", + (group_name,) + ) + if group_id := cursor.fetchone(): + return group_id[0] + return None |