about summary refs log tree commit diff
diff options
context:
space:
mode:
authorJohn Nduli2024-07-19 22:31:07 +0300
committerBonfaceKilz2024-07-23 10:18:32 +0300
commit4fd8f3d707283a52b0a53d9eb180c732d387a609 (patch)
tree1ae25b064bc56630086259d8fac7d0e1e9a41c41
parent39449d36a8b4332ef23cb1631dbda34d76c7f377 (diff)
downloadgenenetwork3-4fd8f3d707283a52b0a53d9eb180c732d387a609.tar.gz
chore: mypy and pylint fixes
-rwxr-xr-xscripts/update_rif_table.py62
1 files changed, 40 insertions, 22 deletions
diff --git a/scripts/update_rif_table.py b/scripts/update_rif_table.py
index a092967..e40774a 100755
--- a/scripts/update_rif_table.py
+++ b/scripts/update_rif_table.py
@@ -1,16 +1,18 @@
+#!/usr/bin/env python3
+
 """
 Script responsible for updating the GeneRIF_BASIC table
 """
 
 import argparse
 import csv
+import datetime
 import gzip
 import pathlib
 from tempfile import TemporaryDirectory
 from typing import Dict, Generator
-import requests
-import datetime
 
+import requests
 from MySQLdb.cursors import DictCursor
 
 from gn3.db_utils import database_connection
@@ -23,6 +25,12 @@ GENERIFS_BASIC_URL = "https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz"
 VERSION_ID = 5
 
 
+INSERT_QUERY = """ INSERT INTO GeneRIF_BASIC
+(SpeciesId, GeneId, symbol, PubMed_Id, createtime, comment, TaxID, VersionId)
+VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
+"""
+
+
 def download_file(url: str, dest: pathlib.Path):
     """Saves the contents of url in dest"""
     with requests.get(url, stream=True) as resp:
@@ -51,8 +59,15 @@ def parse_gene_info_from_ncbi(fname: pathlib.Path) -> Dict[str, str]:
 
 
 def build_already_exists_cache(conn) -> dict:
+    """
+    Build cache for all GeneId, SpeciesID, createtime, PubMed_ID combinations.
+    Helps prevent duplicate inserts.
+    """
     cache = {}
-    query = "SELECT COUNT(*) as cnt, GeneId, SpeciesId, createtime, PubMed_ID from GeneRIF_BASIC GROUP BY GeneId, SpeciesId, createtime, PubMed_Id"
+    query = """SELECT
+        COUNT(*) as cnt, GeneId, SpeciesId, createtime, PubMed_ID
+        from GeneRIF_BASIC
+        GROUP BY GeneId, SpeciesId, createtime, PubMed_Id """
 
     with conn.cursor(DictCursor) as cursor:
         cursor.execute(query)
@@ -67,6 +82,22 @@ def build_already_exists_cache(conn) -> dict:
     return cache
 
 
+def should_add_rif_row(row: dict, exists_cache: dict) -> bool:
+    """Checks if we can add a rif_row, prevent duplicate errors from Mysql"""
+    species_id = str(TAX_IDS[row["#Tax ID"]])
+    insert_date = datetime.datetime.fromisoformat(row["last update timestamp"])
+    search_key = (
+        row["Gene ID"],
+        species_id,
+        insert_date,
+        row["PubMed ID (PMID) list"],
+    )
+    if search_key not in exists_cache:
+        exists_cache[search_key] = 1
+        return True
+    return False
+
+
 def update_rif(sqluri: str):
     """Update GeneRIF_BASIC table"""
     with TemporaryDirectory() as _tmpdir:
@@ -81,10 +112,6 @@ def update_rif(sqluri: str):
         )
 
         genedict = parse_gene_info_from_ncbi(gene_info_path)
-        insert_query = """ INSERT INTO GeneRIF_BASIC
-        (SpeciesId, GeneId, symbol, PubMed_Id, createtime, comment, TaxID, VersionId)
-        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
-        """
         with database_connection(sql_uri=sqluri) as con:
             exists_cache = build_already_exists_cache(con)
             cursor = con.cursor()
@@ -92,16 +119,10 @@ def update_rif(sqluri: str):
             for row in read_tsv_file(generif_basics_path):
                 if row["#Tax ID"] not in TAX_IDS:
                     continue
+                if not should_add_rif_row(row, exists_cache):
+                    skipped_if_exists += 1
+                    continue
                 species_id = TAX_IDS[row["#Tax ID"]]
-                insert_date = datetime.datetime.fromisoformat(
-                    row["last update timestamp"]
-                )
-                search_key = (
-                    row["Gene ID"],
-                    str(species_id),
-                    insert_date,
-                    row["PubMed ID (PMID) list"],
-                )
                 symbol = genedict.get(row["Gene ID"], "")
                 insert_values = (
                     species_id,  # SpeciesId
@@ -113,16 +134,13 @@ def update_rif(sqluri: str):
                     row["#Tax ID"],  # TaxID
                     VERSION_ID,  # VersionId
                 )
-                if search_key in exists_cache:
-                    skipped_if_exists += 1
-                    continue
-                exists_cache[search_key] = 1
-                cursor.execute(insert_query, insert_values)
+                cursor.execute(INSERT_QUERY, insert_values)
                 added += 1
         print(
             f"Generif_BASIC table updated. Added {added}. "
             f"Skipped {skipped_if_exists} because they already exists. "
-            f"In case of error, you can use VersionID={VERSION_ID} to find rows inserted with this script"
+            f"In case of error, you can use VersionID={VERSION_ID} to find "
+            "rows inserted with this script"
         )