# Copyright (C) University of Tennessee Health Science Center, Memphis, TN.
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Affero General Public License
# as published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Affero General Public License for more details.
#
# This program is available from Source Forge: at GeneNetwork Project
# (sourceforge.net/projects/genenetwork/).
#
# Contact Drs. Robert W. Williams and Xiaodong Zhou (2010)
# at rwilliams@uthsc.edu and xzhou15@uthsc.edu
#
#
#
# This module is used by GeneNetwork project (www.genenetwork.org)
#
# Created by GeneNetwork Core Team 2010/08/10
# Updated on Lei Yan 2011/02/08
# created by Lei Yan 02/08/2011

"""
Script responsible for updating the GenerRIF_BASIC table
"""

import argparse
import csv
import gzip
import pathlib
from tempfile import TemporaryDirectory
from typing import Dict, Generator
import requests

from gn3.db_utils import database_connection

TAX_IDS = {"10090": 1, "9606": 4, "10116": 2, "3702": 3}

GENE_INFO_URL = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz"
GENERIFS_BASIC_URL = "https://ftp.ncbi.nih.gov/gene/GeneRIF/generifs_basic.gz"

# TODO: Set this to a version that isn't already in use in the RIF database
VERSION_ID = 4


def download_file(url: str, dest: pathlib.Path):
    """Saves the contents of url in dest"""
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        with open(dest, "wb") as downloaded_file:
            for chunk in resp.iter_content(chunk_size=8192):
                downloaded_file.write(chunk)


def read_tsv_file(fname: pathlib.Path) -> Generator:
    """Load tsv file from NCBI"""
    with gzip.open(fname, mode="rt") as gz_file:
        reader = csv.DictReader(gz_file, delimiter="\t", quoting=csv.QUOTE_NONE)
        yield from reader


def parse_gene_info_from_ncbi(fname: pathlib.Path) -> Dict[str, str]:
    """Parse gene_info into geneid: symbol pairs"""
    genedict: Dict[str, str] = {}
    for row in read_tsv_file(fname):
        if row["#tax_id"] not in TAX_IDS:
            continue
        gene_id, symbol = row["GeneID"], row["Symbol"]
        genedict[gene_id] = symbol
    return genedict


def update_rif(sqluri: str):
    """Update GeneRIF_BASIC table"""
    with TemporaryDirectory() as _tmpdir:
        tmpdir = pathlib.Path(_tmpdir)
        gene_info_path = tmpdir / "gene_info.gz"
        download_file(GENE_INFO_URL, gene_info_path)

        generif_basics_path = tmpdir / "generif_basics.gz"
        download_file(
            GENERIFS_BASIC_URL,
            generif_basics_path,
        )
        genedict = parse_gene_info_from_ncbi(gene_info_path)
        insert_query = """
                                INSERT IGNORE INTO GeneRIF_BASIC
                                SET GeneRIF_BASIC.`SpeciesId`=%s,
                                        GeneRIF_BASIC.`GeneId`=%s,
                                        GeneRIF_BASIC.`symbol`=%s,
                                        GeneRIF_BASIC.`PubMed_ID`=%s,
                                        GeneRIF_BASIC.`createtime`=%s,
                                        GeneRIF_BASIC.`comment`=%s,
                                        GeneRIF_BASIC.`TaxID`=%s,
                                        VersionId=%s
                                """

        with database_connection(sql_uri=sqluri) as con:
            with con.cursor() as cursor:
                for row in read_tsv_file(generif_basics_path):
                    if row["#Tax ID"] not in TAX_IDS:
                        continue
                    species_id = TAX_IDS[row["#Tax ID"]]
                    symbol = genedict.get(row["Gene ID"], "")
                    insert_values = (
                        species_id,  # SpeciesId
                        row["Gene ID"],  # GeneId
                        symbol,  # symbol
                        row["PubMed ID (PMID) list"],  # PubMed_ID
                        row["last update timestamp"],  # createtime
                        row["GeneRIF text"],  # comment
                        row["#Tax ID"],  # TaxID
                        VERSION_ID,  # VersionId
                    )
                    cursor.execute(insert_query, insert_values)
        print(
            f"Generif_BASIC table updated. In case of error, you can do use VersionID={VERSION_ID} to find rows inserted with this script"
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser("Update Generif_BASIC table")
    parser.add_argument(
        "--sql-uri",
        required=True,
        help="MYSQL uri path in the form mysql://user:password@localhost/gn2",
    )
    args = parser.parse_args()
    update_rif(args.sql_uri)