# gn2/base/data_set/utils.py

"data_set package utilities"

import datetime
import os
import json
import hashlib
from typing import List


from gn2.utility.tools import get_setting, SQL_URI
from gn2.base.webqtlConfig import TMPDIR
from gn2.wqflask.database import parse_db_url, database_connection

def geno_mrna_confidentiality(ob):
    """Report whether the dataset described by ``ob`` is flagged confidential.

    Looks up the row whose ``Name`` equals ``ob.name`` in the
    ``{ob.type}Freeze`` table and inspects its ``confidentiality`` column.

    Args:
        ob: object exposing ``type`` (prefix of the ``Freeze`` table name)
            and ``name`` (matched against the ``Name`` column).

    Returns:
        ``True`` when a matching row exists and its ``confidentiality``
        value is truthy; ``None`` otherwise.
    """
    with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
        # NOTE(review): a table name cannot be bound as a query parameter, so
        # ``ob.type`` is interpolated directly -- it must not come from
        # untrusted input. Confirm against callers.
        cursor.execute(
            "SELECT confidentiality, "
            f"AuthorisedUsers FROM {ob.type}Freeze WHERE Name = %s",
            (ob.name,)
        )
        rows = cursor.fetchall()
        # Test the flag itself (first selected column): a fetched row is a
        # non-empty sequence, so checking the row alone would report every
        # existing dataset as confidential.
        if rows and rows[0][0]:
            return True
        return None

def query_table_timestamp(dataset_type: str):
    """Return the last-update time of the ``{dataset_type}Data`` table.

    The value is read from ``information_schema.tables.UPDATE_TIME`` for the
    schema taken from element 3 of ``parse_db_url(SQL_URI)`` (presumably the
    database name -- confirm against ``parse_db_url``).

    Args:
        dataset_type: prefix of the ``Data`` table whose timestamp is wanted.

    Returns:
        The timestamp formatted as ``"%Y-%m-%d %H:%M:%S"``. When the table
        is not listed, or its ``UPDATE_TIME`` is empty, the current local
        time is used instead.
    """
    with database_connection(get_setting("SQL_URI")) as conn, conn.cursor() as cursor:
        fetch_db_name = parse_db_url(SQL_URI)
        # Schema and table name are ordinary string values here, so they are
        # bound as parameters rather than spliced into the statement.
        cursor.execute(
            "SELECT UPDATE_TIME FROM "
            "information_schema.tables "
            "WHERE TABLE_SCHEMA = %s "
            "AND TABLE_NAME = %s",
            (fetch_db_name[3], f"{dataset_type}Data"))
        row = cursor.fetchone()
        # fetchone() yields None when no table matches; treat that the same
        # way as a NULL UPDATE_TIME instead of failing on ``None[0]``.
        date_time_obj = row[0] if row else None
        if not date_time_obj:
            date_time_obj = datetime.datetime.now()
        return date_time_obj.strftime("%Y-%m-%d %H:%M:%S")


def generate_hash_file(
        dataset_name: str,
        dataset_type: str,
        dataset_timestamp: str,
        samplelist: str):
    """Build the cache key for a dataset query.

    The key is the hexadecimal MD5 digest of ``dataset_name``,
    ``dataset_timestamp`` and ``samplelist`` concatenated in that order with
    no separator. ``dataset_type`` is accepted but does not take part in the
    digest.
    """
    key_material = "{}{}{}".format(dataset_name, dataset_timestamp, samplelist)
    return hashlib.md5(key_material.encode()).hexdigest()


def cache_dataset_results(dataset_name: str, dataset_type: str, samplelist: List, query_results: List):
    """Write ``query_results`` as JSON to a cache file under ``TMPDIR``.

    The file is named ``<key>.json``, where the key comes from
    ``generate_hash_file`` applied to the dataset name and type, the current
    timestamp of the ``dataset_type`` table and the comma-joined
    ``samplelist``. An existing file with the same name is overwritten.
    """
    last_updated = query_table_timestamp(dataset_type)
    cache_key = generate_hash_file(
        dataset_name, dataset_type, last_updated, ",".join(samplelist))
    destination = os.path.join(TMPDIR, cache_key + ".json")

    with open(destination, "w") as out:
        json.dump(query_results, out)


def fetch_cached_results(dataset_name: str, dataset_type: str, samplelist: List):
    """Load previously cached query results for a dataset, if any.

    The cache file is ``<key>.json`` under ``TMPDIR``, where the key comes
    from ``generate_hash_file`` applied to the dataset name and type, the
    current timestamp of the ``dataset_type`` table and the comma-joined
    ``samplelist``.

    Returns:
        The decoded JSON content of the cache file, or ``None`` when the
        file is missing, unreadable, or does not contain valid JSON.
    """
    table_timestamp = query_table_timestamp(dataset_type)
    samplelist_as_str = ",".join(samplelist)

    file_name = generate_hash_file(dataset_name, dataset_type, table_timestamp, samplelist_as_str)
    file_path = os.path.join(TMPDIR, f"{file_name}.json")
    try:
        with open(file_path, "r") as file_handler:
            return json.load(file_handler)
    except (OSError, ValueError):
        # OSError: no cache file yet, or it cannot be read.
        # ValueError: malformed JSON (json.JSONDecodeError) or undecodable
        # bytes (UnicodeDecodeError). Either way this is a cache miss.
        return None