diff options
-rw-r--r-- | scripts/load_phenotypes_to_db.py | 41 |
1 files changed, 31 insertions, 10 deletions
diff --git a/scripts/load_phenotypes_to_db.py b/scripts/load_phenotypes_to_db.py index 5c792f0..d48084e 100644 --- a/scripts/load_phenotypes_to_db.py +++ b/scripts/load_phenotypes_to_db.py @@ -1,21 +1,41 @@ import uuid +import json import logging import argparse +from typing import Optional, Union from pathlib import Path from MySQLdb.cursors import Cursor, DictCursor from gn_libs import jobs, mysqldb, sqlite3 +from uploader.publications.models import create_new_publications logging.basicConfig( format="%(asctime)s — %(filename)s:%(lineno)s — %(levelname)s: %(message)s") logger = logging.getLogger(__name__) -def save_publications(cursor: Cursor, pubfiles): +def save_publications( + conn: mysqldb.Connection, + pubfiles, + standalone_publication_id: Optional[int] = None +) -> tuple[dict[str, Union[int, str]], ...]: """Read the `pubfiles` and save the publications therein.""" + _publications = tuple() + if standalone_publication_id: + # HACK: This is a hack. Remove once we update bundle creation to include + # publication(s) in the bundle + _publications = _publications + ( + fetch_publication_by_id(conn, standalone_publication_id),) + # -> check whether the publication-id exists? + # -> perhaps setup the bundle with the appropriate publications + # -> gn-data -> (phenotypes-se, phenotypes-n) + # -> gn-metadata -> (pheno, geno) + if len(pubfiles) > 0:# TODO: check for presence of data — improve this check. + logger.info("Saving publications.") + _publications = _publications + create_new_publications(conn, pubfiles) # Check for PubMed IDs, perhaps? - pass + return _publications def save_phenotypes(cursor: Cursor, phenofiles): @@ -46,6 +66,7 @@ def cross_reference_phenotypes_publications_and_data( def load_data(conn, job): """Load the data attached in the given job.""" + _job_metadata = json.loads(job["job-metadata"]) with conn.cursor(cursorclass=DictCursor) as cursor: # Steps # 0.
Read data from the files: can be multiple files per type @@ -54,7 +75,8 @@ def load_data(conn, job): # -> return phenotype IDs _control_data = rqtl.control_data(job["job-metadata"]["bundle-file"]) logger.info("Saving basic phenotype data.") - _phenos = save_phenotypes(cursor, _control_data["pheno"]) + + _phenos = save_phenotypes(cursor, _control_data) _next_data_id = fetch_next_dataid(...) dataidmap = { row["phenotype_id"]: { @@ -65,13 +87,12 @@ } # 2. Save any new publications (in multi-file bundle): # -> return publication IDS - logger.info("Saving publications.") - # -> check whether the publication-id exists? - # -> perhaps setup the bundle with the appropriate publications - # -> gn-data -> (phenotypes-se, phenotypes-n) - # -> gn-metadata -> (pheno, geno) - publication = save_publications( - cursor, _control_data.get("metadata", {}).get("publications")) + publications = save_publications( + conn, + _control_data.get( + "metadata", {}).get( + "publications"), + _job_metadata.get("publicationid")) _pubidmap = { # TODO: Map the pheno ids to the publication ids } |