aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author	Frederick Muriuki Muriithi	2025-05-19 10:16:52 -0500
committer	Frederick Muriuki Muriithi	2025-05-19 10:26:29 -0500
commite6adace2f9302a01b796176b7016feb9fae3d351 (patch)
tree7ed1420bf676a528cb42b4fba31311fa2eb22e06
parent65367196ae1fc0c7fd0fa004abefd0bd86d2684e (diff)
downloadgn-uploader-e6adace2f9302a01b796176b7016feb9fae3d351.tar.gz
Initialise function to save publications
Do a rudimentary save of the publications: this is incomplete and probably very buggy.
-rw-r--r--scripts/load_phenotypes_to_db.py41
1 file changed, 31 insertions(+), 10 deletions(-)
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py
index 5c792f0..d48084e 100644
--- a/scripts/load_phenotypes_to_db.py
+++ b/scripts/load_phenotypes_to_db.py
@@ -1,21 +1,41 @@
import uuid
+import json
import logging
import argparse
+from typing import Optional, Union
from pathlib import Path
from MySQLdb.cursors import Cursor, DictCursor
from gn_libs import jobs, mysqldb, sqlite3
+from uploader.publications.models import create_new_publications
logging.basicConfig(
format="%(asctime)s — %(filename)s:%(lineno)s — %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
def save_publications(
        conn: "mysqldb.Connection",
        pubfiles,
        standalone_publication_id: Optional[int] = None
) -> tuple[dict[str, Union[int, str]], ...]:
    """Read the `pubfiles` and save the publications therein.

    Parameters:
        conn: Open database connection used for all inserts/lookups.
        pubfiles: Collection of publication files from the bundle; when
            non-empty, their contents are persisted via
            `create_new_publications`.
        standalone_publication_id: Optional ID of an already-existing
            publication to attach, fetched via `fetch_publication_by_id`.

    Returns:
        A tuple of publication rows (dicts of column name to value) — the
        fetched standalone publication (if any) followed by any newly
        created ones.
    """
    _publications: tuple = tuple()
    if standalone_publication_id:
        # HACK: Remove once we update bundle creation to include
        # publication(s) in the bundle.
        # NOTE(review): `fetch_publication_by_id` is not imported in this
        # patch — confirm it is brought into scope before this runs.
        _publications = _publications + (
            fetch_publication_by_id(conn, standalone_publication_id),)
        # -> check whether the publication-id exists?
        # -> perhaps setup the bundle with the appropriate publications
        #    -> gn-data -> (phenotypes-se, phenotypes-n)
        #    -> gn-metadata -> (pheno, geno)
    if len(pubfiles) > 0:  # TODO: check for presence of data — improve this check.
        logger.info("Saving publications.")
        # BUGFIX: was `_publication + create_new_publications(conn, pubs)` —
        # both `_publication` and `pubs` were undefined names.
        _publications = _publications + create_new_publications(conn, pubfiles)
    # Check for PubMed IDs, perhaps?
    return _publications
def save_phenotypes(cursor: Cursor, phenofiles):
@@ -46,6 +66,7 @@ def cross_reference_phenotypes_publications_and_data(
def load_data(conn, job):
"""Load the data attached in the given job."""
+ _job_metadata = json.loads(job["job-metadata"])
with conn.cursor(cursorclass=DictCursor) as cursor:
# Steps
# 0. Read data from the files: can be multiple files per type
@@ -54,7 +75,8 @@ def load_data(conn, job):
# -> return phenotype IDs
_control_data = rqtl.control_data(job["job-metadata"]["bundle-file"])
logger.info("Saving basic phenotype data.")
- _phenos = save_phenotypes(cursor, _control_data["pheno"])
+
+ _phenos = save_phenotypes(cursor, _control_data)
_next_data_id = fetch_next_dataid(...)
dataidmap = {
row["phenotype_id"]: {
@@ -65,13 +87,12 @@ def load_data(conn, job):
}
# 2. Save any new publications (in multi-file bundle):
# -> return publication IDS
- logger.info("Saving publications.")
- # -> check whether the publication-id exists?
- # -> perhaps setup the bundle with the appropriate publications
- # -> gn-data -> (phenotypes-se, phenotypes-n)
- # -> gn-metadata -> (pheno, geno)
- publication = save_publications(
- cursor, _control_data.get("metadata", {}).get("publications"))
+ publications = publications + save_publications(
+ cursor,
+ _control_data.get(
+ "metadata", {}).get(
+ "publications"),
+ _job_metadata.get("publicationid"))
_pubidmap = {
# TODO: Map the pheno ids to the publication ids
}