aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2021-11-22 12:03:21 +0300
committerFrederick Muriuki Muriithi2021-11-22 12:06:47 +0300
commit55d698b1fb07afe74bf1dd570f9f495aefea1086 (patch)
tree9e181f60c77d11498b1c2514c47bf3f626e658d6
parent854416ba850b7793aa9aa95528b89bc69df26888 (diff)
downloadgenenetwork3-55d698b1fb07afe74bf1dd570f9f495aefea1086.tar.gz
Migrate `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function
Issue: https://github.com/genenetwork/gn-gemtext-threads/blob/main/topics/gn1-migration-to-gn2/partial-correlations.gmi * Migrate the `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function from GN1 to GN3.
-rw-r--r--gn3/db/correlations.py147
1 files changed, 147 insertions, 0 deletions
diff --git a/gn3/db/correlations.py b/gn3/db/correlations.py
index f327dc3..ff570b4 100644
--- a/gn3/db/correlations.py
+++ b/gn3/db/correlations.py
@@ -379,3 +379,150 @@ def check_symbol_for_tissue_correlation(
return True
return False
+
+def fetch_sample_ids(
+ conn: Any, sample_names: Tuple[str, ...], species_name: str) -> Tuple[
+ int, ...]:
+ """
+ Given a sequence of sample names, and a species name, return the sample ids
+ that correspond to both.
+
+ This is a partial migration of the
+ `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in
+ GeneNetwork1.
+ """
+ query = (
+ "SELECT Strain.Id FROM Strain, Species "
+ "WHERE Strain.Name IN %(samples_names)s "
+ "AND Strain.SpeciesId=Species.Id "
+ "AND Species.name=%(species_name)s")
+ with conn.cursor() as cursor:
+ cursor.execute(
+ query, samples_names=tuple(samples),
+ species_name=species)
+ return cursor.fetchall()
+
+def fetch_all_database_data(
+ conn: Any, species: str, gene_id: int, gene_symbol: str,
+ samples: Tuple[str, ...], db_type: str, db_name: str, method: str,
+ returnNumber: int, tissueProbeSetFreezeId: int) -> Tuple[Any, Any]:
+ """
+ This is a migration of the
+ `web.webqtl.correlation.CorrelationPage.fetchAllDatabaseData` function in
+ GeneNetwork1.
+ """
+ def __build_query_sgo_lit__(temp_table, sample_id_columns, joins):
+ return (
+ (f"SELECT {db_type}.Name, {temp_table}.value " +
+ sample_id_columns +
+ f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " +
+ f"LEFT JOIN {temp_table} ON {temp_table}.GeneId2=ProbeSet.GeneId " +
+ " ".join(joins) +
+ f" WHERE ProbeSet.GeneId IS NOT NULL " +
+ f"AND {temp_table}.value IS NOT NULL " +
+ f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " +
+ f"AND {db_type}Freeze.Name = %(db_name)s " +
+ f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " +
+ f"ORDER BY {db_type}.Id"),
+ 2)
+
+ def __build_query_tissue_corr__(temp_table, sample_id_columns, joins):
+ return (
+ (f"SELECT {db_type}.Name, {temp_table}.Correlation, " +
+ f"{temp_table}.PValue, " +
+ sample_id_columns +
+ f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " +
+ f"LEFT JOIN {temp_table} ON {temp_table}.Symbol=ProbeSet.Symbol " +
+ " ".join(joins) +
+ f" WHERE ProbeSet.Symbol IS NOT NULL " +
+ f"AND {temp_table}.Correlation IS NOT NULL " +
+ f"AND {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " +
+ f"AND {db_type}Freeze.Name = %(db_name)s " +
+ f"AND {db_type}.Id = {db_type}XRef.%sId "
+ f"ORDER BY {db_type}.Id"),
+ 3)
+
+ def __build_query__(sample_ids, temp_table):
+ sample_id_columns = ", ".join(f"T{smpl}.value" for smpl in samples_ids)
+ if db_type == "Publish":
+ joins = tuple(
+ ("LEFT JOIN PublishData AS T{item} "
+ "ON T{item}.Id = PublishXRef.DataId "
+ "AND T{item}.StrainId = %(T{item}_sample_id)s")
+ for item in sample_ids)
+ return (
+ ("SELECT PublishXRef.Id, " +
+ sample_id_columns +
+ "FROM (PublishXRef, PublishFreeze) " +
+ " ".join(joins) +
+ " WHERE PublishXRef.InbredSetId = PublishFreeze.InbredSetId "
+ "AND PublishFreeze.Name = %(db_name)s"),
+ 1)
+ if temp_table is not None:
+ joins = tuple(
+ ("LEFT JOIN {db_type}Data AS T{item} "
+ "ON T{item}.Id = {db_type}XRef.DataId "
+ "AND T{item}.StrainId=%(T{item}_sample_id)s")
+ for item in sample_ids)
+ if method.lower() == "sgo literature correlation":
+ return __build_query_sgo_lit__(
+ sample_ids, temp_table, sample_id_columns)
+ if method.lower() in (
+ "tissue correlation, pearson's r",
+ "tissue correlation, spearman's rho"):
+ return __build_query_tissue_corr__(
+ sample_ids, temp_table, sample_id_columns)
+ joins = tuple(
+ (f"LEFT JOIN {db_type}Data AS T{item} "
+ f"ON T{item}.Id = {db_type}XRef.DataId "
+ f"AND T{item}.StrainId = %(T{item}_sample_id)s")
+ for item in sample_ids)
+ return (
+ (
+ f"SELECT {db_type}.Name, " +
+ sample_id_columns +
+ f" FROM ({db_type}, {db_type}XRef, {db_type}Freeze) " +
+ " ".join(joins) +
+ f" WHERE {db_type}XRef.{db_type}FreezeId = {db_type}Freeze.Id " +
+ f"AND {db_type}Freeze.Name = %(db_name)s " +
+ f"AND {db_type}.Id = {db_type}XRef.{db_type}Id " +
+ f"ORDER BY {db_type}.Id"),
+ 1)
+
+ def __fetch_data__(sample_ids, temp_table):
+ query, data_start_pos = __build_query__(sample_ids, temp_table)
+ with conn.cursor() as cursor:
+ cursor.execute(
+ query, db_name=db_name,
+ **{f"T{item}_sample_id": item for item in sample_ids})
+ return cursor.fetchall()
+
+ sample_ids = tuple(
+ # look into graduating this to an argument and removing the `samples`
+ # and `species` argument: function currying and compositions might help
+ # with this
+ f"{sample_id}" for sample_id in
+ fetch_sample_ids(conn, samples, species))
+
+ temp_table = None
+ if gene_id and db_type == "probeset":
+ if method.lower() == "sgo literature correlation":
+ temp_table = build_temporary_literature_table(
+ conn, species, gene_id, return_number)
+ if method.lower() in (
+ "tissue correlation, pearson's r",
+ "tissue correlation, spearman's rho"):
+ temp_table = build_temporary_tissue_correlations_table(
+ conn, trait_symbol, probeset_freeze_id, method, return_number)
+
+ trait_database = tuple(
+ item for sublist in
+ (__fetch_data__(ssample_ids, temp_table)
+ for ssample_ids in partition_all(25, sample_ids))
+ for item in sublist)
+
+ if temp_table:
+ with conn.cursor() as cursor:
+ cursor.execute(f"DROP TEMPORARY TABLE {temp_table}")
+
+ return trait_database, data_start_pos