aboutsummaryrefslogtreecommitdiff
path: root/scripts
diff options
context:
space:
mode:
authorJohn Nduli2024-07-12 16:06:17 +0300
committerBonfaceKilz2024-07-23 10:18:32 +0300
commit8c63ec50597656fe5dc51b0cc749254e71029935 (patch)
tree867da0e9244c826b6642356e3134881b75b9af6d /scripts
parent3bdda52511277ee8d9c09ea6332512256f408ad7 (diff)
downloadgenenetwork3-8c63ec50597656fe5dc51b0cc749254e71029935.tar.gz
refactor: reorganize update_rif script to be more pythonic
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/update_rif_table.py179
1 files changed, 93 insertions, 86 deletions
diff --git a/scripts/update_rif_table.py b/scripts/update_rif_table.py
index 8566d60..e7cfa08 100755
--- a/scripts/update_rif_table.py
+++ b/scripts/update_rif_table.py
@@ -28,103 +28,110 @@
Script responsible for updating the GenerRIF_BASIC table
"""
+import argparse
+import csv
import os
+import gzip
+import pathlib
import sys
+from tempfile import TemporaryDirectory
+from typing import Generator, List
+import requests
+
import MySQLdb
-path1 = os.path.abspath(os.path.dirname(__file__))
-path2 = path1 + "/.."
-path3 = path1 + "/../../tmp"
-sys.path.insert(0, path2)
+from gn3.db_utils import database_connection
+
+TAX_IDS = {"10090": 1, "9606": 4, "10116": 2, "3702": 3}
+
+# os.chdir(path3)
+
+
+def download_file(url: str, dest: pathlib.Path):
+ """Saves the contents of url in dest"""
+ with requests.get(url, stream=True) as resp:
+ resp.raise_for_status()
+ with open(dest, "wb") as f_:
+ for chunk in resp.iter_content(chunk_size=8192):
+ f_.write(chunk)
-def fetchrif():
- """ TODO: break this down into modules """
- try:
- con = MySQLdb.Connect(db="gn3", host="localhost", user="gn2", passwd="password")
- cursor = con.cursor()
- print("You have successfully connected to mysql.\n")
- except:
- print("You entered incorrect password.\n")
- sys.exit(0)
+def read_tsv_file(fname: pathlib.Path) -> Generator[dict]:
+ with gzip.open(fname, mode="rt") as gz_file:
+ reader = csv.DictReader(gz_file, delimiter="\t", quoting=csv.QUOTE_NOTE)
+ yield from reader
- tax_ids = {"10090": 1, "9606": 4, "10116": 2, "3702": 3}
- tax_id_keys = tax_ids.keys()
- os.chdir(path3)
- print(f"path3: {path3}")
+def parse_gene_info_from_ncbi(fname: pathlib.Path):
genedict = {}
+ for row in read_tsv_file(fname):
+ if row["#task_id"] not in TAX_IDS:
+ continue
+ genedict[row["GeneID"]] = genedict[row["Symbol"]]
+ return genedict
- os.system("rm -vf gene_info")
- os.system("wget ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz")
- os.system("gunzip gene_info.gz")
-
- with open("gene_info", "r") as file:
- i = 0
- for line1 in file:
- line1 = line1.strip()
- if line1.startswith("#"):
- continue
- line2 = line1.strip().split("\t")
- if line2[0] in tax_id_keys:
- genedict[line2[1]] = line2[2]
- i += 1
- if i % 1000000 == 0:
- print(f"finished: {i}")
- print(f"finished all: {i}")
-
- os.system("rm -vf generifs_basic")
- os.system("wget ftp://ftp.ncbi.nlm.nih.gov/gene/GeneRIF/generifs_basic.gz")
- os.system("gunzip generifs_basic.gz")
-
- with open("generifs_basic", "r") as file:
- i = 0
- for line1 in file:
- line1 = line1.strip()
- if line1.startswith("#"):
- continue
- line2 = line1.strip().split("\t")
- if line2[0] in tax_id_keys and len(line2) >= 5:
- line2[0] = tax_ids[line2[0]]
- try:
- symbol = genedict[line2[1]]
- except:
- symbol = ""
- sql = """
- SELECT COUNT(*)
- FROM GeneRIF_BASIC
- WHERE GeneRIF_BASIC.`SpeciesId`=%s
- AND GeneRIF_BASIC.`GeneId`=%s
- AND GeneRIF_BASIC.`PubMed_ID`=%s
- AND GeneRIF_BASIC.`createtime`=%s
- AND GeneRIF_BASIC.`comment`=%s
- """
- cursor.execute(sql, (line2[0], line2[1], line2[2], line2[3], line2[4]))
- count = cursor.fetchone()[0]
- if count == 0:
- print("to insert...")
- sql = """
- INSERT INTO GeneRIF_BASIC
- SET GeneRIF_BASIC.`SpeciesId`=%s,
- GeneRIF_BASIC.`GeneId`=%s,
- GeneRIF_BASIC.`symbol`=%s,
- GeneRIF_BASIC.`PubMed_ID`=%s,
- GeneRIF_BASIC.`createtime`=%s,
- GeneRIF_BASIC.`comment`=%s
- """
- cursor.execute(
- sql, (line2[0], line2[1], symbol, line2[2], line2[3], line2[4])
- )
- i += 1
- if i % 100000 == 0:
- print(f"finished: {i}")
- print(f"finished all: {i}")
- cursor.close()
+def update_rif(sqluri: str):
+ """TODO: break this down into modules"""
+ with TemporaryDirectory() as tmpdir:
+ gene_info_path = tmpdir / "gene_info.gz"
+ download_file(
+ "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz", gene_info_path
+ )
+
+ generif_basics_path = pathlib.Path("/tmp/generif_basics.gz")
+ download_file(
+ "https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz",
+ generif_basics_path,
+ )
+
+ genedict = parse_gene_info_from_ncbi(gene_info_path)
+ count_check_query = """
+ SELECT COUNT(*)
+ FROM GeneRIF_BASIC
+ WHERE GeneRIF_BASIC.`SpeciesId`=%s
+ AND GeneRIF_BASIC.`GeneId`=%s
+ AND GeneRIF_BASIC.`PubMed_ID`=%s
+ AND GeneRIF_BASIC.`createtime`=%s
+ AND GeneRIF_BASIC.`comment`=%s
+ """
+ insert_query = """
+ INSERT INTO GeneRIF_BASIC
+ SET GeneRIF_BASIC.`SpeciesId`=%s,
+ GeneRIF_BASIC.`GeneId`=%s,
+ GeneRIF_BASIC.`symbol`=%s,
+ GeneRIF_BASIC.`PubMed_ID`=%s,
+ GeneRIF_BASIC.`createtime`=%s,
+ GeneRIF_BASIC.`comment`=%s
+ """
+
+ with database_connection(sql_uri=sqluri) as con:
+ with con.cursor() as cursor:
+ for row in read_tsv_file(generif_basics_path):
+ if row["#Tax ID"] not in TAX_IDS:
+ continue
+ row["#Tax ID"] = TAX_IDS[row["#Tax ID"]]
+ symbol = genedict.get(row["Gene ID"], "")
+ insert_values = (
+ row["#Tax ID"],
+ symbol,
+ row["PubMed ID (PMID) list"],
+ row["last update timestamp"],
+ row["GeneRIF text"],
+ )
+ cursor.execute(count_check_query, insert_values)
+ count = cursor.fetchone()[0]
+ if count != 0:
+ continue
+ cursor.execute(insert_query, insert_values)
-# /usr/bin/python addRif.py
if __name__ == "__main__":
- print(f"command line arguments:\n\t{sys.argv}")
- fetchrif()
- print("exit successfully")
+ parser = argparse.ArgumentParser("Update Generif_BASIC table")
+ parser.add_argument(
+ "--sql-uri",
+ required=True,
+ help="MYSQL uri path in the form mysql://user:password@localhost/gn2",
+ )
+ args = parser.parse_args()
+ update_rif(args.sql_uri)