aboutsummaryrefslogtreecommitdiff
path: root/gn3/computations/datasets.py
diff options
context:
space:
mode:
Diffstat (limited to 'gn3/computations/datasets.py')
-rw-r--r--gn3/computations/datasets.py371
1 files changed, 0 insertions, 371 deletions
diff --git a/gn3/computations/datasets.py b/gn3/computations/datasets.py
deleted file mode 100644
index b69583e..0000000
--- a/gn3/computations/datasets.py
+++ /dev/null
@@ -1,371 +0,0 @@
-"""module contains the code all related to datasets"""
-import json
-from math import ceil
-from collections import defaultdict
-
-from typing import Optional
-from typing import List
-
-from dataclasses import dataclass
-from MySQLdb import escape_string # type: ignore
-
-import requests
-from gn3.settings import GN2_BASE_URL
-
-
def retrieve_trait_sample_data(dataset,
                               trait_name: str,
                               database,
                               group_species_id=None) -> List:
    """Fetch the sample rows stored for one trait of a dataset.

    The ``type`` entry of ``dataset`` selects a query template from
    ``get_query_for_dataset_sample``; the template is filled in with
    %-formatting and handed to ``fetch_from_db_sample_data``.

    Args:
        dataset: mapping read with ``.get`` for "name", "id" and "type".
        trait_name: trait identifier substituted into the query.
        database: connection object forwarded to the fetch helper.
        group_species_id: species id, only substituted into the "Geno"
            query.

    Returns:
        The rows returned by the fetch helper, or an empty list when no
        query template exists for the dataset type.
    """
    dataset_type = dataset.get("type")
    query_template = get_query_for_dataset_sample(dataset_type)
    if not query_template:
        return []

    # Positional values for each template, in placeholder order.
    values_by_type = {
        "Publish": (trait_name, dataset.get("id")),
        "Geno": (group_species_id, trait_name, dataset.get("name")),
        "ProbeSet": (trait_name, dataset.get("name")),
    }
    return fetch_from_db_sample_data(
        query_template % values_by_type[dataset_type], database)
-
-
def fetch_from_db_sample_data(formatted_query: str, database_instance) -> List:
    """Run ``formatted_query`` and return every row it produces.

    The query is executed exactly as given (no parameter binding happens
    here), so the caller is responsible for having built it safely.

    Args:
        formatted_query: complete SQL statement to execute.
        database_instance: connection object exposing ``cursor()``.

    Returns:
        The result of ``cursor.fetchall()``.

    Raises:
        Any exception raised by the driver is propagated unchanged.
    """
    cursor = database_instance.cursor()
    try:
        cursor.execute(formatted_query)
        return cursor.fetchall()
    finally:
        # Release the cursor on the error path too, not only on success.
        cursor.close()
-
-
def get_query_for_dataset_sample(dataset_type) -> Optional[str]:
    """Return the sample-data SQL template for ``dataset_type``.

    The templates are meant to be completed with ``%`` string formatting.
    Placeholders compared against ``Name`` columns are therefore wrapped in
    single quotes inside the template, while placeholders compared against
    ``Id`` columns are left bare.

    Placeholder order:
        "Publish":  PublishXRef.Id, PublishFreeze.Id
        "Geno":     Geno.SpeciesId, Geno.Name, GenoFreeze.Name
        "ProbeSet": ProbeSet.Name, ProbeSetFreeze.Name

    Args:
        dataset_type: one of "Publish", "Geno" or "ProbeSet".

    Returns:
        The query template, or ``None`` for any other dataset type.
    """
    pheno_query = """
        SELECT
            Strain.Name, PublishData.value, PublishSE.error,NStrain.count, Strain.Name2
        FROM
            (PublishData, Strain, PublishXRef, PublishFreeze)
        left join PublishSE on
            (PublishSE.DataId = PublishData.Id AND PublishSE.StrainId = PublishData.StrainId)
        left join NStrain on
            (NStrain.DataId = PublishData.Id AND
            NStrain.StrainId = PublishData.StrainId)
        WHERE
            PublishXRef.InbredSetId = PublishFreeze.InbredSetId AND
            PublishData.Id = PublishXRef.DataId AND PublishXRef.Id = %s AND
            PublishFreeze.Id = %s AND PublishData.StrainId = Strain.Id
        Order BY
            Strain.Name
        """

    # The two name placeholders are quoted so that %-formatting yields valid
    # SQL string literals, matching the ProbeSet template below.
    geno_query = """
        SELECT
            Strain.Name, GenoData.value, GenoSE.error, "N/A", Strain.Name2
        FROM
            (GenoData, GenoFreeze, Strain, Geno, GenoXRef)
        left join GenoSE on
            (GenoSE.DataId = GenoData.Id AND GenoSE.StrainId = GenoData.StrainId)
        WHERE
            Geno.SpeciesId = %s AND Geno.Name = '%s' AND GenoXRef.GenoId = Geno.Id AND
            GenoXRef.GenoFreezeId = GenoFreeze.Id AND
            GenoFreeze.Name = '%s' AND
            GenoXRef.DataId = GenoData.Id AND
            GenoData.StrainId = Strain.Id
        Order BY
            Strain.Name
        """

    probeset_query = """
        SELECT
            Strain.Name, ProbeSetData.value, ProbeSetSE.error, NStrain.count, Strain.Name2
        FROM
            (ProbeSetData, ProbeSetFreeze,
            Strain, ProbeSet, ProbeSetXRef)
        left join ProbeSetSE on
            (ProbeSetSE.DataId = ProbeSetData.Id AND ProbeSetSE.StrainId = ProbeSetData.StrainId)
        left join NStrain on
            (NStrain.DataId = ProbeSetData.Id AND
            NStrain.StrainId = ProbeSetData.StrainId)
        WHERE
            ProbeSet.Name = '%s' AND ProbeSetXRef.ProbeSetId = ProbeSet.Id AND
            ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id AND
            ProbeSetFreeze.Name = '%s' AND
            ProbeSetXRef.DataId = ProbeSetData.Id AND
            ProbeSetData.StrainId = Strain.Id
        Order BY
            Strain.Name
        """

    dataset_query = {
        "Publish": pheno_query,
        "Geno": geno_query,
        "ProbeSet": probeset_query,
    }
    return dataset_query.get(dataset_type)
-
-
@dataclass
class Dataset:
    """Plain record holding the identifying details of a dataset."""
    # Dataset name; stays None until one is supplied.
    name: Optional[str] = None
    # Dataset type string; stays None until one is supplied.
    dataset_type: Optional[str] = None
    # Defaults to -1 when no id has been assigned.
    dataset_id: int = -1
-
-
def create_mrna_tissue_dataset(dataset_name, dataset_type):
    """Build the ``Dataset`` record for an mRNA assay dataset.

    An mRNA assay is a quantitative assessment (assay) associated with an
    mRNA trait. This used to be called "probeset", but that term refers
    specifically to the Affymetrix platform and is far too specific.
    """
    mrna_dataset = Dataset(name=dataset_name, dataset_type=dataset_type)
    return mrna_dataset
-
-
def dataset_type_getter(dataset_name, redis_instance=None) -> Optional[str]:
    """Resolve the type of the dataset called ``dataset_name``.

    The cache is consulted first; on a miss the name-to-type mapping is
    fetched from the GN2 API and the entry for ``dataset_name`` is returned.

    Args:
        dataset_name: short name of the dataset.
        redis_instance: optional cache object exposing ``get(key)``. When it
            is ``None`` the cache lookup is skipped.

    Returns:
        The dataset type, or ``None`` when the name is unknown.
    """
    cached = None
    if redis_instance is not None:
        # Only the key is passed: a one-argument ``get`` (as on a redis
        # client) rejects a second positional "default" argument.
        cached = redis_instance.get(dataset_name)

    if cached:
        # A redis client may hand back bytes; callers expect text.
        return cached.decode("utf-8") if isinstance(cached, bytes) else cached

    fetched = fetch_dataset_type_from_gn2_api(dataset_name)
    if isinstance(fetched, dict):
        # The API helper returns the whole name -> type mapping.
        return fetched.get(dataset_name)
    return fetched
-
-
def fetch_dataset_type_from_gn2_api(dataset_name):
    """Download the GN2 dropdown data and map dataset names to types.

    Intended for the case where the cache does not hold the dataset type.

    Args:
        dataset_name: accepted for interface compatibility; the complete
            mapping is built and returned regardless of its value.

    Returns:
        dict mapping each dataset's short name to "Publish", "Geno" or
        "ProbeSet".
    """
    del dataset_name  # not needed to build the full mapping

    map_dataset_to_new_type = {
        "Phenotypes": "Publish",
        "Genotypes": "Geno",
        "MrnaTypes": "ProbeSet",
    }

    response = requests.get(
        GN2_BASE_URL + "/api/v_pre1/gen_dropdown", timeout=5)
    data = json.loads(response.content)

    dataset_structure = {}
    for groups in data['datasets'].values():
        for dataset_types in groups.values():
            for dataset_type, datasets in dataset_types.items():
                # Any section that is not phenotypes/genotypes is treated as
                # an mRNA assay, i.e. the "ProbeSet" type.
                new_type = map_dataset_to_new_type.get(
                    dataset_type, "ProbeSet")
                for dataset in datasets:
                    # assumes the first usable item is the dataset short name
                    short_dataset_name = next(
                        (item for item in dataset
                         if item != "None" and item is not None),
                        None)
                    if short_dataset_name is None:
                        # Entry without any usable name: nothing to record.
                        continue
                    dataset_structure[short_dataset_name] = new_type
    return dataset_structure
-
-
def dataset_creator_store(dataset_type):
    """Look up the factory function that builds ``dataset_type`` datasets.

    Only "ProbeSet" is registered; any other type raises ``KeyError``.
    """
    creators = {"ProbeSet": create_mrna_tissue_dataset}
    creator = creators[dataset_type]
    return creator
-
-
def create_dataset(dataset_type=None, dataset_name: str = None):
    """Create a dataset object for ``dataset_name``.

    When ``dataset_type`` is not given it is resolved through
    ``dataset_type_getter``; the matching factory from
    ``dataset_creator_store`` then builds the object.
    """
    resolved_type = (dataset_type_getter(dataset_name)
                     if dataset_type is None else dataset_type)
    build = dataset_creator_store(resolved_type)
    return build(dataset_name=dataset_name, dataset_type=resolved_type)
-
-
def fetch_dataset_sample_id(samplelist: List, database, species: str) -> dict:
    """Fetch the ids of the strains named in ``samplelist``.

    Args:
        samplelist: strain names to look up.
        database: connection object exposing ``cursor()``.
        species: species name the strains must belong to.

    Returns:
        dict mapping strain name to strain id for every matching strain;
        empty when ``samplelist`` is empty.
    """
    sample_names = list(samplelist)
    if not sample_names:
        # "IN ()" is not valid SQL, and nothing could match anyway.
        return {}

    # One bound placeholder per name: the driver does the quoting, so the
    # names and the species never become part of the SQL text.
    placeholders = ", ".join(["%s"] * len(sample_names))
    strain_query = """
        SELECT Strain.Name, Strain.Id FROM Strain, Species
        WHERE Strain.Name IN ({})
        and Strain.SpeciesId=Species.Id
        and Species.name = %s
        """.format(placeholders)

    database_cursor = database.cursor()
    try:
        database_cursor.execute(strain_query, (*sample_names, species))
        results = database_cursor.fetchall()
    finally:
        database_cursor.close()

    return dict(results)
-
-
def divide_into_chunks(the_list, number_chunks):
    """Split ``the_list`` into at most ``number_chunks`` consecutive slices.

    An empty list yields ``[[]]``. When more chunks are requested than
    there are elements, one chunk per element is produced.

    >>> divide_into_chunks([1, 2, 7, 3, 22, 8, 5, 22, 333], 3)
    [[1, 2, 7], [3, 22, 8], [5, 22, 333]]
    """
    total = len(the_list)
    if not total:
        return [[]]

    wanted = min(number_chunks, total)
    step = int(ceil(total / wanted))
    return [the_list[start:start + step] for start in range(0, total, step)]
-
-
def escape(string_):
    """Escape ``string_`` with ``escape_string`` and return it as text."""
    escaped_bytes = escape_string(string_)
    return escaped_bytes.decode('utf8')
-
-
def mescape(*items) -> List:
    """Escape several query values at once.

    Each item is converted with ``str`` before being escaped, so
    non-string values such as ids are accepted.
    """
    as_text = map(str, items)
    return [escape_string(text).decode('utf8') for text in as_text]
-
-
def get_traits_data(sample_ids, database_instance, dataset_name, dataset_type):
    """Build and run the per-chunk trait-data queries for a dataset.

    MySQL limits the number of tables that can be used in a join to 61, so
    the sample ids are broken into smaller chunks and one query is issued
    per chunk. Postgres doesn't have that limit, so the chunking can go
    away after a transition.

    NOTE(review): the fetched rows are discarded and an empty list is always
    returned; this looks unfinished, but the return value is kept as-is so
    existing callers are not affected.

    Args:
        sample_ids: strain ids; each one becomes a joined ``T<id>`` table.
        database_instance: connection handed to ``fetch_from_db_sample_data``.
        dataset_name: value matched against ``<type>Freeze.Name``.
        dataset_type: table-name prefix such as "Publish" or "ProbeSet".

    Returns:
        Always ``[]``.
    """
    is_publish = dataset_type == "Publish"
    full_dataset_type = "Phenotype" if is_publish else dataset_type

    # These clauses do not depend on the chunk, so build them once.
    if is_publish:
        select_clause = "SELECT {}XRef.Id,".format(escape(dataset_type))
    else:
        select_clause = "SELECT {}.Name,".format(escape(full_dataset_type))

    from_clause = ' FROM ({}, {}XRef, {}Freeze) '.format(
        *mescape(full_dataset_type, dataset_type, dataset_type))

    # For non-"Publish" datasets full_dataset_type equals dataset_type, so
    # this single argument list reproduces both original branches.
    where_clause = """
        WHERE {}XRef.{}FreezeId = {}Freeze.Id
        and {}Freeze.Name = '{}'
        and {}.Id = {}XRef.{}Id
        order by {}.Id
        """.format(*mescape(dataset_type, dataset_type,
                            dataset_type, dataset_type,
                            dataset_name, full_dataset_type,
                            dataset_type, dataset_type,
                            full_dataset_type))

    chunk_size = 61
    number_chunks = int(ceil(len(sample_ids) / chunk_size))
    for sample_ids_step in divide_into_chunks(sample_ids, number_chunks):
        value_columns = ', '.join(
            'T%s.value' % item for item in sample_ids_step)

        joins = ''.join(
            """
            left join {}Data as T{} on T{}.Id = {}XRef.DataId
            and T{}.StrainId={}\n
            """.format(*mescape(dataset_type, item,
                                item, dataset_type, item, item))
            for item in sample_ids_step)

        query = (select_clause + value_columns + from_clause
                 + joins + where_clause)

        # The query is executed, but its rows are not used (see NOTE above).
        fetch_from_db_sample_data(query, database_instance)

    return []
-
-
def get_probeset_trait_data(strain_ids: List, conn, dataset_name) -> dict:
    """Fetch ProbeSet trait values grouped by trait name.

    Similar to ``get_traits_data``; the difference is that it uses a
    sub-query instead of one join per strain.

    Args:
        strain_ids: ids matched against ``ProbeSetData.StrainID``.
        conn: connection object usable as a context manager and exposing
            ``cursor()``.
        dataset_name: value matched against ``ProbeSetFreeze.Name``.

    Returns:
        dict mapping each trait name to the list of values found for it, in
        the order the rows were returned. Empty when ``strain_ids`` is empty.

    Raises:
        KeyError: when a data row refers to a data id that the trait-name
            query did not return.
    """
    strain_id_list = list(strain_ids)
    if not strain_id_list:
        # "in ()" is not valid SQL, and no row could match anyway.
        return {}

    # Values are bound by the driver rather than formatted into the SQL.
    strain_placeholders = ", ".join(["%s"] * len(strain_id_list))

    traits_query = """
        SELECT ProbeSetXRef.DataId,ProbeSet.Name FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
        WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
        and ProbeSetFreeze.Name = %s
        and ProbeSet.Id = ProbeSetXRef.ProbeSetId
        order by ProbeSet.Id
        """

    query = """
        SELECT * from ProbeSetData
        where StrainID in ({})
        and id in (SELECT ProbeSetXRef.DataId FROM (ProbeSet, ProbeSetXRef, ProbeSetFreeze)
        WHERE ProbeSetXRef.ProbeSetFreezeId = ProbeSetFreeze.Id
        and ProbeSetFreeze.Name = %s
        and ProbeSet.Id = ProbeSetXRef.ProbeSetId
        order by ProbeSet.Id)
        """.format(strain_placeholders)

    with conn:
        cursor = conn.cursor()
        try:
            cursor.execute(query, (*strain_id_list, dataset_name))
            rows = cursor.fetchall()
            cursor.execute(traits_query, (dataset_name,))
            trait_id_name = dict(cursor.fetchall())
        finally:
            cursor.close()

    trait_data: dict = {}
    for trait_id, _strain_id, strain_value in rows:
        trait_data.setdefault(
            trait_id_name[trait_id], []).append(strain_value)

    return trait_data