about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- scripts/compute-phenotype-means.py 105
-rw-r--r-- scripts/load_phenotypes_to_db.py 2
2 files changed, 107 insertions, 0 deletions
diff --git a/scripts/compute-phenotype-means.py b/scripts/compute-phenotype-means.py
new file mode 100644
index 0000000..3b876b7
--- /dev/null
+++ b/scripts/compute-phenotype-means.py
@@ -0,0 +1,105 @@
+"""Compute phenotype means."""
+import sys
+import logging
+from pathlib import Path
+from typing import TypeVar
+from argparse import Namespace, ArgumentParser
+
+import MySQLdb
+
+from gn_libs import mysqldb
+from uploader import setup_modules_logging
+from .load_phenotypes_to_db import update_means
+
+# Root logging defaults; main() later sets this module's level from --log-level.
+logging.basicConfig(
+    level=logging.INFO, encoding="utf-8",
+    format="%(asctime)s - %(name)s - %(levelname)s — %(message)s")
+logger = logging.getLogger(__name__)
+
+
+def fetch_xref_id(conn: mysqldb.Connection, population_id: int) -> tuple[int, ...]:
+    """Return the 'Id' of every PublishXRef row whose InbredSetId is 'population_id'."""
+    logger.debug("Fetching the xref IDs.")
+    sql = "SELECT Id FROM PublishXRef WHERE InbredSetId=%(population_id)s"
+    with conn.cursor(cursorclass=MySQLdb.cursors.DictCursor) as dbcursor:
+        dbcursor.execute(sql, {"population_id": population_id})
+        return tuple(int(record["Id"]) for record in dbcursor.fetchall())
+
+
+def run(args) -> int:
+    """Compute the phenotype means and report the outcome as an exit status.
+
+    Returns 0 after updating the means, or 1 when there are no IDs to process.
+    """
+    logger.debug("Running the script!")
+    with mysqldb.database_connection(args.db_uri) as dbconn:
+        # Explicit --cross-ref-ids win; otherwise use all the population's IDs.
+        target_ids = args.cross_ref_ids or fetch_xref_id(dbconn, args.population_id)
+        if not target_ids:
+            _causes = ("no population exists with the ID '%s'",
+                       "the population exists but it has no phenotypes linked to it yet")
+            logger.error(
+                ("No cross-reference IDs to run against. Likely causes are: "
+                 + " OR ".join(_causes) + "."),
+                args.population_id)
+            return 1
+        update_means(dbconn, args.population_id, target_ids)
+        logger.debug("Successfully computed means for %02d phenotypes.", len(target_ids))
+        return 0
+
+
+T = TypeVar("T")  # type of the items that 'itemstype' produces
+def comma_separated_list(val: str, itemstype: type[T] = str) -> tuple[T, ...]:
+    """Split 'val' on commas into a tuple of 'itemstype' items, ignoring blank items."""
+    return tuple(itemstype(item) for item in map(str.strip, val.split(",")) if item)
+
+
+def comma_separated_list_of_integers(val: str) -> tuple[int, ...]:
+    """Parse 'val', a comma-separated string, into a tuple of 'int' items."""
+    return comma_separated_list(val, itemstype=int)
+
+
+if __name__ == "__main__":
+    def parse_args() -> Namespace:
+        """Build the command-line parser and parse this script's arguments.
+
+        Positional: db-uri, jobs-db-path, population-id.
+        Optional: --log-level (default "info"), --cross-ref-ids (default []).
+        """
+        cli = ArgumentParser(
+            "compute-phenotype-means",
+            description="Compute/Recompute the phenotype means.")
+        cli.add_argument(
+            "db_uri", metavar="db-uri", type=str,
+            help="MariaDB/MySQL connection URL")
+        cli.add_argument(
+            "jobs_db_path", metavar="jobs-db-path", type=Path,
+            help="Path to jobs' SQLite database.")
+        cli.add_argument(
+            "population_id", metavar="population-id", type=int,
+            help=("Identifier for the InbredSet group/"
+                  "population to run means against."))
+        # Optional arguments.
+        cli.add_argument(
+            "--log-level",
+            default="info",
+            choices=("debug", "info", "warning", "error", "critical"),
+            type=str,
+            help="Determines what is logged out.")
+        cli.add_argument(
+            "--cross-ref-ids",
+            default=[],
+            type=comma_separated_list_of_integers,
+            help=("Provide cross-reference IDs to narrow the number of "
+                  "phenotypes that the means are computed against."))
+        return cli.parse_args()
+
+    def main() -> int:
+        """Parse the CLI, apply the requested log level, then run the script."""
+        cli_args = parse_args()
+        logger.setLevel(getattr(logging, cli_args.log_level.upper()))
+        setup_modules_logging(logger, ("scripts.load_phenotypes_to_db",))
+        return run(cli_args)
+
+    sys.exit(main())
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index 9158307..e449b82 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -414,6 +414,7 @@ def update_means(
         xref_ids: tuple[int, ...]
 ):
     """Compute the means from the data and update them in the database."""
+    logger.info("Computing means for %02d phenotypes.", len(xref_ids))
     query = (
         "UPDATE PublishXRef SET mean = "
         "(SELECT AVG(value) FROM PublishData"
@@ -426,6 +427,7 @@ def update_means(
             batch = take(_xref_iterator, 10000)
             if len(batch) == 0:
                 break
+            logger.info("\tComputing means for batch of %02d phenotypes.", len(batch))
             cursor.executemany(
                 query,
                 tuple({