Diffstat (limited to 'gn3')
-rw-r--r--  gn3/api/correlation.py           |  12
-rw-r--r--  gn3/api/datasets.py              |  44
-rw-r--r--  gn3/api/traits.py                |  53
-rw-r--r--  gn3/app.py                       |   4
-rw-r--r--  gn3/computations/correlations.py |  13
-rw-r--r--  gn3/computations/datasets.py     | 323
-rw-r--r--  gn3/computations/traits.py       |  56
-rw-r--r--  gn3/experimental_db.py           |  11
-rw-r--r--  gn3/settings.py                  |   3
9 files changed, 507 insertions, 12 deletions
diff --git a/gn3/api/correlation.py b/gn3/api/correlation.py
index 2339088..f28e1f5 100644
--- a/gn3/api/correlation.py
+++ b/gn3/api/correlation.py
@@ -33,9 +33,10 @@ def compute_sample_integration(corr_method="pearson"):
@correlation.route("/sample_r/<string:corr_method>", methods=["POST"])
def compute_sample_r(corr_method="pearson"):
- """correlation endpoint for computing sample r correlations\
+ """Correlation endpoint for computing sample r correlations\
api expects the trait data with has the trait and also the\
- target_dataset data"""
+ target_dataset data
+ """
correlation_input = request.get_json()
# xtodo move code below to compute_all_sampl correlation
@@ -53,9 +54,10 @@ def compute_sample_r(corr_method="pearson"):
@correlation.route("/lit_corr/<string:species>/<int:gene_id>", methods=["POST"])
def compute_lit_corr(species=None, gene_id=None):
- """api endpoint for doing lit correlation.results for lit correlation\
+ """Api endpoint for doing lit correlation.results for lit correlation\
are fetched from the database this is the only case where the db\
- might be needed for actual computing of the correlation results"""
+ might be needed for actual computing of the correlation results
+ """
conn, _cursor_object = database_connector()
target_traits_gene_ids = request.get_json()
@@ -72,7 +74,7 @@ def compute_lit_corr(species=None, gene_id=None):
@correlation.route("/tissue_corr/<string:corr_method>", methods=["POST"])
def compute_tissue_corr(corr_method="pearson"):
- """api endpoint fr doing tissue correlation"""
+ """Api endpoint fr doing tissue correlation"""
tissue_input_data = request.get_json()
primary_tissue_dict = tissue_input_data["primary_tissue"]
target_tissues_dict = tissue_input_data["target_tissues_dict"]
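A minimal client sketch of how the correlation endpoints might be called
(assumes a gn3 server on localhost:8080; the payload shape is illustrative,
not the definitive schema):

    import requests

    # hypothetical payload; compute_sample_r reads the posted JSON for the
    # base trait's sample data and the target dataset
    payload = {
        "this_trait": {"trait_sample_data": {"C57BL/6J": 7.2, "DBA/2J": 6.9}},
        "target_dataset": {}
    }
    response = requests.post(
        "http://localhost:8080/api/correlation/sample_r/pearson", json=payload)
    print(response.json())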
diff --git a/gn3/api/datasets.py b/gn3/api/datasets.py
new file mode 100644
index 0000000..7f08de5
--- /dev/null
+++ b/gn3/api/datasets.py
@@ -0,0 +1,44 @@
+"""this module contains code for creating datasets"""
+from flask import Blueprint
+from flask import jsonify
+
+from gn3.computations.datasets import create_dataset
+from gn3.computations.datasets import get_traits_data
+from gn3.experimental_db import database_connector
+
+
+dataset = Blueprint("dataset", __name__)
+
+
+@dataset.route("/create/<dataset_name>/")
+@dataset.route("/create/<dataset_name>/<dataset_type>")
+def create_dataset_api(dataset_name, dataset_type=None):
+ """Endpoint of creating dataset"""
+
+ new_dataset = create_dataset(
+ dataset_type=dataset_type, dataset_name=dataset_name)
+
+ results = {
+ "dataset": new_dataset
+ }
+ return jsonify(results)
+
+
+@dataset.route("/fetch_traits_data/<dataset_name>/<dataset_type>")
+def fetch_traits_data(dataset_name, dataset_type):
+ """Endpoint for fetching Trait data"""
+    # temp: should fetch these sample ids instead of hard-coding them
+ trait_sample_ids = [4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15,
+ 17, 18, 19, 20, 21, 22, 24, 25, 26, 28, 29, 30, 31,
+ 35, 36, 37, 39, 98, 99, 100, 103, 487, 105, 106, 110, 115,
+ 116, 117, 118, 119, 120, 919, 147,
+ 121, 40, 41, 124, 125, 128, 135, 129, 130, 131,
+ 132, 134, 138, 139, 140, 141, 142, 144,
+ 145, 148, 149, 920, 922, 2, 3, 1, 1100]
+
+ conn, _cursor = database_connector()
+ results = get_traits_data(sample_ids=trait_sample_ids, database_instance=conn,
+ dataset_name=dataset_name, dataset_type=dataset_type)
+ conn.close()
+
+ return jsonify({"results": results})
diff --git a/gn3/api/traits.py b/gn3/api/traits.py
new file mode 100644
index 0000000..0ac437d
--- /dev/null
+++ b/gn3/api/traits.py
@@ -0,0 +1,53 @@
+"""this module contains the all endpoints for traits"""
+from unittest import mock
+
+from flask import Blueprint
+from flask import jsonify
+from flask import request
+
+from gn3.computations.traits import fetch_trait
+from gn3.computations.traits import get_trait_info_data
+from gn3.experimental_db import database_connector
+
+trait = Blueprint("trait", __name__)
+
+
+@trait.route("/<string:trait_name>/<string:dataset_name>")
+def create_trait(trait_name, dataset_name):
+ """Endpoint for creating trait and fetching strain\
+ values"""
+
+    # xtodo replace the object; at most this endpoint
+    # requires dataset_type, dataset_name, dataset_id
+ trait_dataset = {
+ "name": dataset_name,
+ "id": 12,
+ "type": "ProbeSet" # temp values
+ }
+ conn, _cursor = database_connector()
+
+ trait_results = fetch_trait(dataset=trait_dataset,
+ trait_name=trait_name,
+ database=conn)
+
+ conn.close()
+
+ return jsonify(trait_results)
+
+
+@trait.route("/trait_info/<string:trait_name>", methods=["POST"])
+def fetch_trait_info(trait_name):
+ """Api endpoint for fetching the trait info \
+ expects the trait and trait dataset to have\
+ been created """
+ data = request.get_json()
+
+ trait_dataset = data["trait_dataset"]
+ trait_data = data["trait"]
+ _trait_name = trait_name # should be used as key to return results
+
+ database_instance = mock.Mock()
+
+ results = get_trait_info_data(trait_dataset, trait_data, database_instance)
+
+ return jsonify(results)
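A sketch of the trait endpoints (assumes a local gn3 instance; the trait and
dataset names are illustrative):

    import requests

    BASE = "http://localhost:8080/api/trait"

    # create a trait and fetch its strain values
    trait = requests.get(BASE + "/1434568_at/HC_M2_0606_P").json()

    # fetch trait info; the trait and its dataset must already have been created
    info = requests.post(
        BASE + "/trait_info/1434568_at",
        json={"trait_dataset": {"name": "HC_M2_0606_P"}, "trait": trait}).json()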
diff --git a/gn3/app.py b/gn3/app.py
index a684d25..f0f35f9 100644
--- a/gn3/app.py
+++ b/gn3/app.py
@@ -7,6 +7,8 @@ from flask import Flask
from gn3.api.gemma import gemma
from gn3.api.general import general
from gn3.api.correlation import correlation
+from gn3.api.traits import trait
+from gn3.api.datasets import dataset
def create_app(config: Union[Dict, str, None] = None) -> Flask:
@@ -28,4 +30,6 @@ def create_app(config: Union[Dict, str, None] = None) -> Flask:
app.register_blueprint(general, url_prefix="/api/")
app.register_blueprint(gemma, url_prefix="/api/gemma")
app.register_blueprint(correlation, url_prefix="/api/correlation")
+ app.register_blueprint(trait, url_prefix="/api/trait")
+ app.register_blueprint(dataset, url_prefix="/api/dataset")
return app
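A smoke-test sketch of the newly registered blueprints, using Flask's test
client (dataset name is illustrative):

    from gn3.app import create_app

    app = create_app()
    with app.test_client() as client:
        # the new blueprints are mounted under /api/trait and /api/dataset
        response = client.get("/api/dataset/create/HC_M2_0606_P/ProbeSet")
        print(response.status_code)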
diff --git a/gn3/computations/correlations.py b/gn3/computations/correlations.py
index 26b7294..7fb67be 100644
--- a/gn3/computations/correlations.py
+++ b/gn3/computations/correlations.py
@@ -109,10 +109,9 @@ package :not packaged in guix
def filter_shared_sample_keys(this_samplelist,
target_samplelist) -> Tuple[List, List]:
- """Given primary and target samplelist for two base and target trait select
-filter the values using the shared keys
-
- """
+ """Given primary and target samplelist\
+ for two base and target trait select\
+ filter the values using the shared keys"""
this_vals = []
target_vals = []
for key, value in target_samplelist.items():
@@ -125,8 +124,9 @@ filter the values using the shared keys
def compute_all_sample_correlation(this_trait,
target_dataset,
corr_method="pearson") -> List:
- """Given a trait data samplelist and target__datasets compute all sample
-correlation"""
+ """Given a trait data samplelist and\
+ target__datasets compute all sample correlation
+ """
this_trait_samples = this_trait["trait_sample_data"]
@@ -323,7 +323,6 @@ def compute_all_lit_correlation(conn, trait_lists: List,
species: str, gene_id):
"""Function that acts as an abstraction for
lit_correlation_for_trait_list"""
- # xtodo to be refactored
lit_results = lit_correlation_for_trait_list(
conn=conn,
diff --git a/gn3/computations/datasets.py b/gn3/computations/datasets.py
new file mode 100644
index 0000000..57e1fe1
--- /dev/null
+++ b/gn3/computations/datasets.py
@@ -0,0 +1,323 @@
+"""module contains the code all related to datasets"""
+import json
+from math import ceil
+from collections import defaultdict
+
+from typing import Optional
+from typing import List
+
+from dataclasses import dataclass
+from MySQLdb import escape_string # type: ignore
+
+import requests
+from gn3.settings import GN2_BASE_URL
+
+
+def retrieve_trait_sample_data(dataset,
+ trait_name: str,
+ database,
+ group_species_id=None) -> List:
+ """given the dataset id and trait_name fetch the\
+ sample_name,value from the dataset"""
+
+    # should pass the db as an arg or do a setup
+
+ (dataset_name, dataset_id, dataset_type) = (dataset.get("name"), dataset.get(
+ "id"), dataset.get("type"))
+
+ dataset_query = get_query_for_dataset_sample(dataset_type)
+ results = []
+ sample_query_values = {
+ "Publish": (trait_name, dataset_id),
+ "Geno": (group_species_id, trait_name, dataset_name),
+ "ProbeSet": (trait_name, dataset_name)
+ }
+
+ if dataset_query:
+ formatted_query = dataset_query % sample_query_values[dataset_type]
+
+ results = fetch_from_db_sample_data(formatted_query, database)
+
+ return results
+
+
+def fetch_from_db_sample_data(formatted_query: str, database_instance) -> List:
+ """this is the function that does the actual fetching of\
+ results from the database"""
+ try:
+ cursor = database_instance.cursor()
+ cursor.execute(formatted_query)
+ results = cursor.fetchall()
+
+ except Exception as error:
+ raise error
+
+ cursor.close()
+
+ return results
+
+
+def get_query_for_dataset_sample(dataset_type) -> Optional[str]:
+ """this functions contains querys for\
+ getting sample data from the db depending in
+ dataset"""
+ dataset_query = {}
+
+ pheno_query = """
+ SELECT
+ Strain.Name, PublishData.value, PublishSE.error,NStrain.count, Strain.Name2
+ FROM
+ (PublishData, Strain, PublishXRef, PublishFreeze)
+ left join PublishSE on
+ (PublishSE.DataId = PublishData.Id AND PublishSE.StrainId = PublishData.StrainId)
+ left join NStrain on
+ (NStrain.DataId = PublishData.Id AND
+ NStrain.StrainId = PublishData.StrainId)
+ WHERE
+ PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND
+ PublishData.Id = PublishXRef.DataId AND PublishXRef.Id = %s AND
+ PublishFreeze.Id = %s AND PublishData.StrainId = Strain.Id
+ Order BY
+ Strain.Name
+ """
+ geno_query = """
+ SELECT
+ Strain.Name, GenoData.value, GenoSE.error, "N/A", Strain.Name2
+ FROM
+ (GenoData, GenoFreeze, Strain, Geno, GenoXRef)
+ left join GenoSE on
+ (GenoSE.DataId = GenoData.Id AND GenoSE.StrainId = GenoData.StrainId)
+ WHERE
+ Geno.SpeciesId = %s AND Geno.Name = %s AND GenoXRef.GenoId = Geno.Id AND
+ GenoXRef.GenoFreezeId = GenoFreeze.Id AND
+ GenoFreeze.Name = %s AND
+ GenoXRef.DataId = GenoData.Id AND
+ GenoData.StrainId = Strain.Id
+ Order BY
+ Strain.Name
+ """
+
+ probeset_query = """
+ SELECT
+ Strain.Name, ProbeSetData.value, ProbeSetSE.error, NStrain.count, Strain.Name2
+ FROM
+ (ProbeSetData, ProbeSetFreeze,
+ Strain, ProbeSet, ProbeSetXRef)
+ left join ProbeSetSE on
+ (ProbeSetSE.DataId = ProbeSetData.Id AND ProbeSetSE.StrainId = ProbeSetData.StrainId)
+ left join NStrain on
+ (NStrain.DataId = ProbeSetData.Id AND
+ NStrain.StrainId = ProbeSetData.StrainId)
+ WHERE
+ ProbeSet.Name = '%s' AND ProbeSetXRef.ProbeSetId = ProbeSet.Id AND
+ ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND
+ ProbeSetFreeze.Name = '%s' AND
+ ProbeSetXRef.DataId = ProbeSetData.Id AND
+ ProbeSetData.StrainId = Strain.Id
+ Order BY
+ Strain.Name
+ """
+
+ dataset_query["Publish"] = pheno_query
+ dataset_query["Geno"] = geno_query
+ dataset_query["ProbeSet"] = probeset_query
+
+ return dataset_query.get(dataset_type)
+
+
+@dataclass
+class Dataset:
+ """class for creating datasets"""
+ name: Optional[str] = None
+ dataset_type: Optional[str] = None
+ dataset_id: int = -1
+
+
+def create_mrna_tissue_dataset(dataset_name, dataset_type):
+ """an mrna assay is a quantitative assessment(assay) associated\
+ with an mrna trait.This used to be called probeset,but that term\
+ only referes specifically to the afffymetrix platform and is\
+ far too speficified"""
+
+ return Dataset(name=dataset_name, dataset_type=dataset_type)
+
+
+def dataset_type_getter(dataset_name, redis_instance=None) -> Optional[str]:
+ """given the dataset name fetch the type\
+ of the dataset this in turn enables fetching\
+ the creation of the correct object could utilize\
+ redis for the case"""
+
+ results = redis_instance.get(dataset_name, None)
+
+ if results:
+ return results
+
+ return fetch_dataset_type_from_gn2_api(dataset_name)
+
+
+def fetch_dataset_type_from_gn2_api(dataset_name):
+ """this function is only called when the\
+ the redis is empty and does have the specificied\
+ dataset_type"""
+ # should only run once
+
+ dataset_structure = {}
+
+ map_dataset_to_new_type = {
+ "Phenotypes": "Publish",
+ "Genotypes": "Geno",
+ "MrnaTypes": "ProbeSet"
+ }
+
+ data = json.loads(requests.get(
+ GN2_BASE_URL + "/api/v_pre1/gen_dropdown", timeout=5).content)
+ _name = dataset_name
+ for species in data['datasets']:
+ for group in data['datasets'][species]:
+ for dataset_type in data['datasets'][species][group]:
+ for dataset in data['datasets'][species][group][dataset_type]:
+                    # assumes the first non-None item is the dataset short name
+ short_dataset_name = next(
+ item for item in dataset if item != "None" and item is not None)
+
+ dataset_structure[short_dataset_name] = map_dataset_to_new_type.get(
+ dataset_type, "MrnaTypes")
+ return dataset_structure
+
+
+def dataset_creator_store(dataset_type):
+ """function contains key value pairs for\
+ the function need to be called to create\
+ each dataset_type"""
+
+ dataset_obj = {
+ "ProbeSet": create_mrna_tissue_dataset
+ }
+
+ return dataset_obj[dataset_type]
+
+
+def create_dataset(dataset_type=None, dataset_name: Optional[str] = None):
+    """Function for creating a new dataset; temp, not fully implemented"""
+ if dataset_type is None:
+ dataset_type = dataset_type_getter(dataset_name)
+
+ dataset_creator = dataset_creator_store(dataset_type)
+ results = dataset_creator(
+ dataset_name=dataset_name, dataset_type=dataset_type)
+ return results
+
+
+def fetch_dataset_sample_id(samplelist: List, database, species: str) -> dict:
+ """fetch the strain ids from the db only if\
+ it is in the samplelist"""
+ # xtodo create an in clause for samplelist
+
+ strain_query = """
+ SELECT Strain.Name, Strain.Id FROM Strain, Species
+ WHERE Strain.Name IN {}
+ and Strain.SpeciesId=Species.Id
+ and Species.name = '{}'
+ """
+
+    database_cursor = database.cursor()
+    # format the samplelist as a tuple so it renders as a SQL IN clause
+    database_cursor.execute(strain_query.format(tuple(samplelist), species))
+
+ results = database_cursor.fetchall()
+
+ return dict(results)
+
+
+def divide_into_chunks(the_list, number_chunks):
+ """Divides a list into approximately number_chunks
+ >>> divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 3)
+ [[1, 2, 7], [3, 22, 8], [5, 22, 333]]"""
+
+ length = len(the_list)
+ if length == 0:
+ return [[]]
+
+ if length <= number_chunks:
+ number_chunks = length
+ chunk_size = int(ceil(length/number_chunks))
+ chunks = []
+
+ for counter in range(0, length, chunk_size):
+ chunks.append(the_list[counter:counter+chunk_size])
+ return chunks
+
+
+def escape(string_):
+ """function escape sql value"""
+ return escape_string(string_).decode('utf8')
+
+
+def mescape(*items) -> List:
+ """multiple escape for query values"""
+
+ return [escape_string(str(item)).decode('utf8') for item in items]
+
+
+def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type):
+ """function to fetch trait data"""
+ # MySQL limits the number of tables that can be used in a join to 61,
+ # so we break the sample ids into smaller chunks
+ # Postgres doesn't have that limit, so we can get rid of this after we transition
+
+ _trait_data = defaultdict(list)
+ chunk_size = 61
+ number_chunks = int(ceil(len(sample_ids) / chunk_size))
+ for sample_ids_step in divide_into_chunks(sample_ids, number_chunks):
+ if dataset_type == "Publish":
+ full_dataset_type = "Phenotype"
+ else:
+ full_dataset_type = dataset_type
+ temp = ['T%s.value' % item for item in sample_ids_step]
+
+ if dataset_type == "Publish":
+ query = "SELECT {}XRef.Id,".format(escape(dataset_type))
+
+ else:
+ query = "SELECT {}.Name,".format(escape(full_dataset_type))
+
+ query += ', '.join(temp)
+ query += ' FROM ({}, {}XRef, {}Freeze) '.format(*mescape(full_dataset_type,
+ dataset_type,
+ dataset_type))
+ for item in sample_ids_step:
+
+ query += """
+ left join {}Data as T{} on T{}.Id = {}XRef.DataId
+ and T{}.StrainId={}\n
+ """.format(*mescape(dataset_type, item,
+ item, dataset_type, item, item))
+
+ if dataset_type == "Publish":
+ query += """
+ WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(dataset_type, dataset_type,
+ dataset_type, dataset_type,
+ dataset_name, full_dataset_type,
+ dataset_type, dataset_type,
+ full_dataset_type))
+
+ else:
+ query += """
+ WHERE {}XRef.{}FreezeId = {}Freeze.Id
+ and {}Freeze.Name = '{}'
+ and {}.Id = {}XRef.{}Id
+ order by {}.Id
+ """.format(*mescape(dataset_type, dataset_type,
+ dataset_type, dataset_type,
+ dataset_name, dataset_type,
+ dataset_type, dataset_type,
+ full_dataset_type))
+
+ _results = fetch_from_db_sample_data(query, database_instance)
+ return {}
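A sketch of the chunking used by get_traits_data to stay under MySQL's
61-table join limit (sample ids are hypothetical):

    from math import ceil
    from gn3.computations.datasets import divide_into_chunks

    sample_ids = list(range(1, 150))  # hypothetical strain ids
    chunk_size = 61                   # MySQL's join limit
    number_chunks = int(ceil(len(sample_ids) / chunk_size))
    chunks = divide_into_chunks(sample_ids, number_chunks)
    assert all(len(chunk) <= chunk_size for chunk in chunks)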
diff --git a/gn3/computations/traits.py b/gn3/computations/traits.py
new file mode 100644
index 0000000..1aa2970
--- /dev/null
+++ b/gn3/computations/traits.py
@@ -0,0 +1,56 @@
+"""module contains all operating related to traits"""
+from gn3.computations.datasets import retrieve_trait_sample_data
+
+
+def fetch_trait(dataset, trait_name: str, database) -> dict:
+ """this method creates a trait by\
+ fetching required data given the\
+ dataset and trait_name"""
+
+ created_trait = {
+ "dataset": dataset,
+ "trait_name": trait_name
+ }
+
+ trait_data = get_trait_sample_data(dataset, trait_name, database)
+
+ created_trait["trait_data"] = trait_data
+
+ return created_trait
+
+
+def get_trait_sample_data(trait_dataset, trait_name, database) -> dict:
+ """first try to fetch the traits sample data from redis if that\
+ try to fetch from the traits dataset redis is only used for\
+ temp dataset type which is not used in this case """
+
+ sample_results = retrieve_trait_sample_data(
+ trait_dataset, trait_name, database)
+
+ trait_data = {}
+
+ for (name, sample_value, _variance, _numcase, _name2) in sample_results:
+
+ trait_data[name] = sample_value
+ return trait_data
+
+
+def get_trait_info_data(trait_dataset,
+ trait_name: str,
+ database_instance,
+ get_qtl_info: bool = False) -> dict:
+ """given a dataset and trait_name return a dict containing all info\
+ regarding the get trait"""
+
+ _temp_var_holder = (trait_dataset, trait_name,
+ database_instance, get_qtl_info)
+ trait_info_data = {
+ "description": "",
+ "chr": "",
+ "locus": "",
+ "mb": "",
+ "abbreviation": "",
+ "trait_display_name": ""
+
+ }
+ return trait_info_data
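A sketch of fetch_trait usage (the dataset descriptor and trait name are
illustrative; fetch_trait only reads the dataset's name/id/type):

    from gn3.computations.traits import fetch_trait
    from gn3.experimental_db import database_connector

    conn, _cursor = database_connector()
    dataset = {"name": "HC_M2_0606_P", "id": 112, "type": "ProbeSet"}
    trait = fetch_trait(dataset=dataset, trait_name="1434568_at", database=conn)
    conn.close()
    # trait -> {"dataset": ..., "trait_name": ..., "trait_data": {sample: value}}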
diff --git a/gn3/experimental_db.py b/gn3/experimental_db.py
new file mode 100644
index 0000000..a07aeba
--- /dev/null
+++ b/gn3/experimental_db.py
@@ -0,0 +1,11 @@
+"""this function contains experimental db staff"""
+from typing import Tuple
+import MySQLdb as mdb # type: ignore
+
+
+def database_connector()->Tuple:
+ """function to create db connector"""
+ conn = mdb.connect("localhost", "kabui", "1234", "db_webqtl")
+ cursor = conn.cursor()
+
+ return (conn, cursor)
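A usage sketch for the experimental connector (credentials are the
hard-coded dev values above):

    from gn3.experimental_db import database_connector

    conn, cursor = database_connector()
    try:
        cursor.execute("SELECT 1")
        print(cursor.fetchone())
    finally:
        conn.close()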
diff --git a/gn3/settings.py b/gn3/settings.py
index e77a977..478a041 100644
--- a/gn3/settings.py
+++ b/gn3/settings.py
@@ -15,3 +15,6 @@ TMPDIR = os.environ.get("TMPDIR", tempfile.gettempdir())
SQL_URI = os.environ.get("SQL_URI", "mysql://kabui:1234@localhost/db_webqtl")
SECRET_KEY = "password"
SQLALCHEMY_TRACK_MODIFICATIONS = False
+# GN2 results are only used in fetching dataset info
+
+GN2_BASE_URL = "http://www.genenetwork.org/"