From aa0f982145ef7a9ea453ffd02c1bbaea0f8021e7 Mon Sep 17 00:00:00 2001 From: John Nduli Date: Mon, 15 Jul 2024 14:57:02 +0300 Subject: refactor: clean up insert query --- scripts/update_rif_table.py | 82 +++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 43 deletions(-) (limited to 'scripts') diff --git a/scripts/update_rif_table.py b/scripts/update_rif_table.py index e7cfa08..5318020 100755 --- a/scripts/update_rif_table.py +++ b/scripts/update_rif_table.py @@ -30,79 +30,73 @@ Script responsible for updating the GenerRIF_BASIC table import argparse import csv -import os import gzip import pathlib -import sys from tempfile import TemporaryDirectory -from typing import Generator, List +from typing import Dict, Generator import requests -import MySQLdb - from gn3.db_utils import database_connection TAX_IDS = {"10090": 1, "9606": 4, "10116": 2, "3702": 3} -# os.chdir(path3) +GENE_INFO_URL = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz" +GENERIFS_BASIC_URL = "https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz" + +# TODO: Set this to a version that isn't already in use in the RIF database +VERSION_ID = 4 def download_file(url: str, dest: pathlib.Path): """Saves the contents of url in dest""" with requests.get(url, stream=True) as resp: resp.raise_for_status() - with open(dest, "wb") as f_: + with open(dest, "wb") as downloaded_file: for chunk in resp.iter_content(chunk_size=8192): - f_.write(chunk) + downloaded_file.write(chunk) -def read_tsv_file(fname: pathlib.Path) -> Generator[dict]: +def read_tsv_file(fname: pathlib.Path) -> Generator: + """Load tsv file from NCBI""" with gzip.open(fname, mode="rt") as gz_file: - reader = csv.DictReader(gz_file, delimiter="\t", quoting=csv.QUOTE_NOTE) + reader = csv.DictReader(gz_file, delimiter="\t", quoting=csv.QUOTE_NONE) yield from reader -def parse_gene_info_from_ncbi(fname: pathlib.Path): - genedict = {} +def parse_gene_info_from_ncbi(fname: pathlib.Path) -> Dict[str, str]: + """Parse gene_info into geneid: symbol pairs""" + genedict: Dict[str, str] = {} for row in read_tsv_file(fname): - if row["#task_id"] not in TAX_IDS: + if row["#tax_id"] not in TAX_IDS: continue - genedict[row["GeneID"]] = genedict[row["Symbol"]] + gene_id, symbol = row["GeneID"], row["Symbol"] + genedict[gene_id] = symbol return genedict def update_rif(sqluri: str): - """TODO: break this down into modules""" - with TemporaryDirectory() as tmpdir: + """Update GeneRIF_BASIC table""" + with TemporaryDirectory() as _tmpdir: + tmpdir = pathlib.Path(_tmpdir) gene_info_path = tmpdir / "gene_info.gz" - download_file( - "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz", gene_info_path - ) + download_file(GENE_INFO_URL, gene_info_path) - generif_basics_path = pathlib.Path("/tmp/generif_basics.gz") + generif_basics_path = tmpdir / "generif_basics.gz" download_file( - "https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz", + GENERIFS_BASIC_URL, generif_basics_path, ) - genedict = parse_gene_info_from_ncbi(gene_info_path) - count_check_query = """ - SELECT COUNT(*) - FROM GeneRIF_BASIC - WHERE GeneRIF_BASIC.`SpeciesId`=%s - AND GeneRIF_BASIC.`GeneId`=%s - AND GeneRIF_BASIC.`PubMed_ID`=%s - AND GeneRIF_BASIC.`createtime`=%s - AND GeneRIF_BASIC.`comment`=%s - """ insert_query = """ - INSERT INTO GeneRIF_BASIC + INSERT IGNORE INTO GeneRIF_BASIC SET GeneRIF_BASIC.`SpeciesId`=%s, GeneRIF_BASIC.`GeneId`=%s, GeneRIF_BASIC.`symbol`=%s, GeneRIF_BASIC.`PubMed_ID`=%s, GeneRIF_BASIC.`createtime`=%s, - GeneRIF_BASIC.`comment`=%s + GeneRIF_BASIC.`comment`=%s, + GeneRIF_BASIC.`TaxID`=%s, + VersionId=%s """ with database_connection(sql_uri=sqluri) as con: @@ -110,20 +104,22 @@ def update_rif(sqluri: str): for row in read_tsv_file(generif_basics_path): if row["#Tax ID"] not in TAX_IDS: continue - row["#Tax ID"] = TAX_IDS[row["#Tax ID"]] + species_id = TAX_IDS[row["#Tax ID"]] symbol = genedict.get(row["Gene ID"], "") insert_values = ( - row["#Tax ID"], - symbol, - row["PubMed ID (PMID) list"], - row["last update timestamp"], - row["GeneRIF text"], + species_id, # SpeciesId + row["Gene ID"], # GeneId + symbol, # symbol + row["PubMed ID (PMID) list"], # PubMed_ID + row["last update timestamp"], # createtime + row["GeneRIF text"], # comment + row["#Tax ID"], # TaxID + VERSION_ID, # VersionId ) - cursor.execute(count_check_query, insert_values) - count = cursor.fetchone()[0] - if count != 0: - continue cursor.execute(insert_query, insert_values) + print( + f"Generif_BASIC table updated. In case of error, you can do use VersionID={VERSION_ID} to find rows inserted with this script" + ) if __name__ == "__main__": -- cgit v1.2.3