about summary refs log tree commit diff
diff options
context:
space:
mode:
authorFrederick Muriuki Muriithi2025-08-25 10:04:13 -0500
committerFrederick Muriuki Muriithi2025-08-26 10:06:10 -0500
commit819e5b3cf86607aebc97f266760025fcc739ff5a (patch)
tree8493507bf16bdda4d47f97c380e1d1319c685503
parent87186314c2431381390595063487eb6a2718a1e4 (diff)
downloadgn-uploader-819e5b3cf86607aebc97f266760025fcc739ff5a.tar.gz
Update script to use newer form of `create_new_phenotypes` function.
-rw-r--r--scripts/load_phenotypes_to_db.py83
1 file changed, 29 insertions, 54 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index c1a7687..3737f2d 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -40,8 +40,10 @@ def __replace_na_strings__(line, na_strings):
 
 
 def save_phenotypes(
-        cursor: mysqldb.Connection,
+        conn: mysqldb.Connection,
         control_data: dict[str, Any],
+        population_id,
+        publication_id,
         filesdir: Path
 ) -> tuple[dict, ...]:
     """Read `phenofiles` and save the phenotypes therein."""
@@ -63,7 +65,9 @@ def save_phenotypes(
                                            control_data["sep"],
                                            control_data["comment.char"])
     return create_new_phenotypes(
-        cursor,
+        conn,
+        population_id,
+        publication_id,
         (dict(zip(_headers,
                   __replace_na_strings__(line, control_data["na.strings"])))
          for filecontent
@@ -75,14 +79,6 @@ def save_phenotypes(
          if idx != 0))
 
 
-def __fetch_next_dataid__(conn: mysqldb.Connection) -> int:
-    """Fetch the next available DataId value from the database."""
-    with conn.cursor(cursorclass=DictCursor) as cursor:
-        cursor.execute(
-            "SELECT MAX(DataId) AS CurrentMaxDataId FROM PublishXRef")
-        return int(cursor.fetchone()["CurrentMaxDataId"]) + 1
-
-
 def __row_to_dataitems__(
         sample_row: dict,
         dataidmap: dict,
@@ -199,34 +195,6 @@ save_phenotypes_n = partial(save_numeric_data,
                              table="NStrain")
 
 
-def cross_reference_phenotypes_publications_and_data(
-        conn: mysqldb.Connection, xref_data: tuple[dict, ...]
-):
-    """Crossreference the phenotypes, publication and data."""
-    with conn.cursor(cursorclass=DictCursor) as cursor:
-        cursor.execute("SELECT MAX(Id) CurrentMaxId FROM PublishXRef")
-        _nextid = int(cursor.fetchone()["CurrentMaxId"]) + 1
-        _params = tuple({**row, "xref_id": _id}
-                        for _id, row in enumerate(xref_data, start=_nextid))
-        cursor.executemany(
-            ("INSERT INTO PublishXRef("
-             "Id, InbredSetId, PhenotypeId, PublicationId, DataId, comments"
-             ") "
-             "VALUES ("
-             "%(xref_id)s, %(population_id)s, %(phenotype_id)s, "
-             "%(publication_id)s, %(data_id)s, 'Upload of new data.'"
-             ")"),
-            _params)
-        cursor.executemany(
-            "UPDATE PublishXRef SET mean="
-            "(SELECT AVG(value) FROM PublishData WHERE PublishData.Id=PublishXRef.DataId) "
-            "WHERE PublishXRef.Id=%(xref_id)s AND "
-            "InbredSetId=%(population_id)s",
-            _params)
-        return _params
-    return tuple()
-
-
 def update_auth(# pylint: disable=[too-many-locals,too-many-positional-arguments,too-many-arguments]
         authserver,
         token,
@@ -369,25 +337,35 @@ def load_data(conn: mysqldb.Connection, job: dict) -> int:#pylint: disable=[too-
     with ZipFile(str(bundle), "r") as zfile:
         _files = rqtl2.extract(zfile, _outdir)
     logger.info("Saving new phenotypes.")
-    _phenos = save_phenotypes(conn, _control_data, _outdir)
-    def __build_phenos_maps__(accumulator, current):
-        dataid, row = current
+    _phenos = save_phenotypes(conn,
+                              _control_data,
+                              _population["Id"],
+                              _publication["Id"],
+                              _outdir)
+
+    def __build_phenos_maps__(accumulator, row):
         return ({
             **accumulator[0],
             row["phenotype_id"]: {
                 "population_id": _population["Id"],
                 "phenotype_id": row["phenotype_id"],
-                "data_id": dataid,
-                "publication_id": _publication["Id"],
+                "data_id": row["data_id"],
+                "publication_id": row["publication_id"],
             }
         }, {
             **accumulator[1],
-            row["id"]: row["phenotype_id"]
-        })
-    dataidmap, pheno_name2id = reduce(
-        __build_phenos_maps__,
-        enumerate(_phenos, start=__fetch_next_dataid__(conn)),
-        ({},{}))
+            row["pre_publication_abbreviation"]: row["phenotype_id"]
+        }, (
+            accumulator[2] + ({
+                "xref_id": row["xref_id"],
+                "population_id": row["population_id"],
+                "phenotype_id": row["phenotype_id"],
+                "publication_id": row["publication_id"],
+                "data_id": row["data_id"]
+            },)))
+    dataidmap, pheno_name2id, _xrefs = reduce(__build_phenos_maps__,
+                                      _phenos,
+                                      ({},{}, tuple()))
     # 3. a. Fetch the strain names and IDS: create name->ID map
     samples = {
         row["Name"]: row
@@ -402,11 +380,8 @@ def load_data(conn: mysqldb.Connection, job: dict) -> int:#pylint: disable=[too-
                                      control_data=_control_data,
                                      filesdir=_outdir)
     logger.info("Saved %s new phenotype data rows.", _num_data_rows)
-    # 4. Cross-reference Phenotype, Publication, and PublishData in PublishXRef
-    logger.info("Cross-referencing new phenotypes to their data and publications.")
-    _xrefs = cross_reference_phenotypes_publications_and_data(
-        conn, tuple(dataidmap.values()))
-    # 5. If standard errors and N exist, save them too
+
+    # 4. If standard errors and N exist, save them too
     #    (use IDs returned in `3. b.` above).
     if _control_data.get("phenose"):
         logger.info("Saving new phenotypes standard errors.")