# gn3/llms/process.py
"""This module contains code for processing responses from the Fahamu client (client.py)."""
# pylint: disable=C0301
import os
import string
import json
import logging
from urllib.parse import quote

from gn3.llms.client import GeneNetworkQAClient


# URL of the Fahamu "tasks" API endpoint.
# NOTE(review): not referenced by any code visible in this module -- confirm it is still needed.
BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks'
# Absolute path of the directory containing this module; DocIDs.load_file
# resolves its JSON files relative to this directory.
BASEDIR = os.path.abspath(os.path.dirname(__file__))


class DocIDs:
    """Lookup table that maps document ids to document names.

    The table is built from two JSON files stored next to this module:

    * ``doc_ids.json``: doc ids for the gn references
    * ``all_files.json``: doc ids for the diabetes references
    """

    def __init__(self):
        """Load both JSON files and merge the diabetes entries into ``doc_ids``.

        Raises:
            FileNotFoundError: if either JSON file is missing from ``BASEDIR``.
        """
        self.doc_ids = self.load_file("doc_ids.json")
        self.sugar_doc_ids = self.load_file("all_files.json")
        self.format_doc_ids(self.sugar_doc_ids)

    def load_file(self, file_name):
        """Read ``file_name`` from ``BASEDIR`` and return the parsed JSON.

        Raises:
            FileNotFoundError: if the path does not exist or is not a
                regular file.
        """
        file_path = os.path.join(BASEDIR, file_name)
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"{file_path}-- FIle does not exist\n")
        with open(file_path, "rb") as file_handler:
            return json.load(file_handler)

    def format_doc_ids(self, docs):
        """Merge the documents listed in ``docs`` into ``self.doc_ids``.

        Only values of ``docs`` that are lists are considered; every item of
        such a list must provide an ``"id"`` and a ``"filename"`` key.  The
        stored name is the filename without a trailing ``.pdf``/``.txt`` and
        with all underscores removed.
        """
        for val in docs.values():
            if not isinstance(val, list):
                continue
            for doc_obj in val:
                doc_name = (doc_obj["filename"]
                            .removesuffix(".pdf")
                            .removesuffix(".txt")
                            .replace("_", ""))
                self.doc_ids[doc_obj["id"]] = doc_name

    def get_info(self, doc_id):
        """Return the entry stored for ``doc_id``, or ``doc_id`` itself when unknown."""
        # Single lookup; falling back to the id lets callers detect a miss
        # by comparing the result with the id they passed in.
        return self.doc_ids.get(doc_id, doc_id)


def format_bibliography_info(bib_info):
    """Turn a bibliography entry into a display string.

    * a string has a trailing ``.txt`` removed;
    * a dict is rendered as ``author.title.year.doi`` followed by a single
      space (a missing key raises ``KeyError``);
    * any other value is returned unchanged.
    """
    if isinstance(bib_info, dict):
        author = bib_info['author']
        title = bib_info['title']
        year = bib_info['year']
        doi = bib_info['doi']
        return f"{author}.{title}.{year}.{doi} "
    if not isinstance(bib_info, str):
        return bib_info
    return bib_info.removesuffix('.txt')


def filter_response_text(val):
    """Drop every character not in ``string.printable`` and parse the rest as JSON.

    Note that ``string.printable`` is ASCII-only, so non-ASCII characters
    are removed as well.
    """
    printable_chars = (str(char) for char in val if char in string.printable)
    cleaned_text = ''.join(printable_chars)
    return json.loads(cleaned_text)


def parse_context(context, get_info_func, format_bib_func):
    """Build one reference record per document in ``context``.

    Args:
        context: mapping of document id to a list of entries, each entry
            being a mapping with a ``"text"`` key.
        get_info_func: callable taking a document id and returning the
            information known for it; returning the id itself signals that
            nothing is known.
        format_bib_func: callable that formats the value returned by
            ``get_info_func`` for display.

    Returns:
        A list of dicts with the keys ``"doc_id"``, ``"bibInfo"`` and
        ``"comboTxt"``; ``"comboTxt"`` is every entry text prefixed with a
        tab and concatenated in order.
    """
    results = []
    for doc_id, summary in context.items():
        # join() avoids rebuilding the string on every iteration.
        combo_txt = "".join("\t" + entry["text"] for entry in summary)
        doc_info = get_info_func(doc_id)
        # An unchanged id means the lookup found nothing, so skip formatting.
        bib_info = doc_id if doc_id == doc_info else format_bib_func(doc_info)
        results.append(
            {"doc_id": doc_id, "bibInfo": bib_info, "comboTxt": combo_txt})
    return results


def load_file(filename, dir_path):
    """Parse and return the JSON stored in ``filename`` under ``dir_path``.

    Raises ``FileNotFoundError`` when the resulting path is missing or is
    not a regular file.
    """
    json_path = os.path.join(dir_path, f"{filename}")
    if os.path.isfile(json_path):
        with open(json_path, "rb") as json_stream:
            return json.load(json_stream)
    raise FileNotFoundError(f"{filename} was not found or is a directory")


def fetch_pubmed(references, file_name, data_dir=""):
    """Attach pubmed data to each reference that has an entry in the pubmed file.

    The file is read from ``<data_dir>/gn-meta/lit/<file_name>``.  A reference
    whose ``"doc_id"`` has a truthy entry in that file gets it stored under
    the ``"pubmed"`` key.  When the file cannot be found the problem is
    logged and ``references`` is returned untouched.  The list is modified
    in place and also returned.
    """
    lit_dir = os.path.join(data_dir, "gn-meta/lit")
    try:
        pubmed = load_file(file_name, lit_dir)
    except FileNotFoundError:
        logging.error("failed to find pubmed_path for %s/%s",
                      data_dir, file_name)
        return references

    for reference in references:
        pubmed_entry = pubmed.get(reference["doc_id"])
        if pubmed_entry:
            reference["pubmed"] = pubmed_entry
    return references


def get_gnqa(query, auth_token, data_dir=""):
    """Entry function for the gn3 api endpoint.

    Args:
         query: the question to ask, e.g. "what is a gene"
         auth_token: token used to connect to the api client
         data_dir: base directory for gn3 data
    Returns:
         A ``(task_id, answer, references)`` tuple, where ``task_id`` is the
         identifier returned by the client for the question, ``answer`` is
         the answer text from the response, and ``references`` is the list
         of parsed context documents, enriched with pubmed info where
         available.
    """
    client = GeneNetworkQAClient(api_key=auth_token)
    # Only the task id from the initial request is needed afterwards.
    _, task_id = client.ask('?ask=' + quote(query), query=query)
    response, _ = client.get_answer(task_id)
    payload = filter_response_text(response.text)['data']
    answer = payload['answer']
    context = payload['context']
    references = parse_context(
        context, DocIDs().get_info, format_bibliography_info)
    return task_id, answer, fetch_pubmed(references, "pubmed.json", data_dir)