gn3/llms/process.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

"""this module contains code for processing response from fahamu client.py"""
# pylint: disable=C0301
import os
import string
import json
import logging
from urllib.parse import quote

from gn3.llms.client import GeneNetworkQAClient


# Tasks endpoint of the Fahamu-hosted GeneNetwork API.
# NOTE(review): not referenced by any function visible in this module --
# presumably consumed by other code; confirm before removing.
BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks'
# Absolute path of the directory that contains this module; DocIDs reads
# its JSON files (doc_ids.json, all_files.json) from here.
BASEDIR = os.path.abspath(os.path.dirname(__file__))


class DocIDs():
    """Lookup table from document ids to document names/info.

    The table is seeded from ``doc_ids.json`` and then extended with the
    documents listed in ``all_files.json``; both are read from BASEDIR.
    """

    def __init__(self):
        """Load both JSON files and merge them into ``self.doc_ids``.

        * doc_ids.json: doc ids for the gn references
        * all_files.json: doc ids for the diabetes references, kept
          as ``self.sugar_doc_ids``
        """
        self.doc_ids = load_file("doc_ids.json", BASEDIR)
        self.sugar_doc_ids = load_file("all_files.json", BASEDIR)
        self.format_doc_ids(self.sugar_doc_ids)

    def format_doc_ids(self, docs):
        """Register the documents listed in ``docs`` in ``self.doc_ids``.

        Only the values of ``docs`` that are lists are used; every item of
        such a list must provide an ``id`` and a ``filename``.  The name
        stored for an id is the filename with a trailing ``.pdf`` and then a
        trailing ``.txt`` stripped, and with every underscore removed.
        """
        for listing in docs.values():
            if not isinstance(listing, list):
                continue
            for document in listing:
                stem = document["filename"].removesuffix(".pdf").removesuffix(".txt")
                self.doc_ids[document["id"]] = stem.replace("_", "")

    def get_info(self, doc_id):
        """Look up the info stored for a document id.

        Args:
            doc_id: str: a search key for doc_ids
        Returns:
              the entry stored under doc_id, or doc_id itself when the
              id is unknown
        """
        return self.doc_ids.get(doc_id, doc_id)


def format_bibliography_info(bib_info):
    """Turn bibliography info into its display form.

    Args:
        bib_info: a file name (str), a dict holding the keys ``author``,
            ``title``, ``year`` and ``doi``, or any other value.
    Returns:
        For a str, the text without a trailing ``.txt``; for a dict, the
        four fields joined by dots and followed by one space; any other
        value is handed back unchanged.
    """
    if isinstance(bib_info, str):
        return bib_info.removesuffix('.txt')
    if not isinstance(bib_info, dict):
        return bib_info
    # The trailing space is part of the produced text.
    return "{author}.{title}.{year}.{doi} ".format_map(bib_info)


def parse_context(context, get_info_func, format_bib_func):
    """Flatten the raw references into one record per document.

     Args:
         context: mapping of doc_id to a list of entries, each entry
             carrying a "text" item
         get_info_func: callable returning the stored info for a doc_id
         format_bib_func: callable formatting that info for display
    Returns:
          a list of dicts with the keys "doc_id", "bibInfo" and
          "comboTxt" (every entry text prefixed by a tab, concatenated)
    """
    def _build_record(doc_id, passages):
        combined = "".join("\t" + passage["text"] for passage in passages)
        info = get_info_func(doc_id)
        # When the lookup just echoes the id back there is nothing to
        # format, so the id itself serves as the bibliography info.
        if doc_id == info:
            bib = doc_id
        else:
            bib = format_bib_func(info)
        return {"doc_id": doc_id, "bibInfo": bib, "comboTxt": combined}

    return [_build_record(doc_id, passages)
            for doc_id, passages in context.items()]


def load_file(filename, dir_path):
    """Read a JSON file and return its decoded content.

    Args:
        filename:  name of the file to read
        dir_path:  directory that contains the file
    Returns: the decoded JSON data
    Raises:
        FileNotFoundError: when the joined path is not an existing file
    """
    target = os.path.join(dir_path, "{}".format(filename))
    if os.path.isfile(target):
        with open(target, "rb") as stream:
            return json.load(stream)
    raise FileNotFoundError(f"{filename} was not found or is a directory")


def fetch_pubmed(references, file_name, data_dir=""):
    """
    Attach PubMed data read from a JSON file to the matching references.

    Args:
        references: iterable of reference mappings, each with a "doc_id"
            item; matching ones get a "pubmed" item set in place.
        file_name (str): Name of the JSON file containing PubMed data.
        data_dir (str): Base directory of the data files; the file is
            looked up in its "gn-meta/lit" sub-directory.

    Returns:
        The ``references`` object that was passed in.  It comes back
        without PubMed data when the JSON file cannot be found (the
        failure is logged, not raised).
    """
    lit_dir = os.path.join(data_dir, "gn-meta/lit")
    try:
        pubmed = load_file(file_name, lit_dir)
        for reference in references:
            entry = pubmed.get(reference["doc_id"])
            # Missing and empty entries are both skipped.
            if entry:
                reference["pubmed"] = entry
    except FileNotFoundError:
        logging.error("failed to find pubmed_path for %s/%s",
                      data_dir, file_name)
    return references


def get_gnqa(query, auth_token, data_dir=""):
    """Entry function for the gn3 api endpoint.

    Args:
         query: the question to ask, e.g. "what is a gene"
         auth_token: token handed to GeneNetworkQAClient as its api key
         data_dir: base directory for gn3 data, forwarded to fetch_pubmed
    Returns:
         a tuple of
         task_id: the task identifier returned by the client's ask call
         answer: the "answer" item of the response data
         references: the parsed "context" of the response data, with
             PubMed info attached where available
    """
    client = GeneNetworkQAClient(api_key=auth_token)
    _ask_response, task_id = client.ask('?ask=' + quote(query), query=query)
    answer_response, _status = client.get_answer(task_id)

    # Characters outside string.printable are dropped before decoding.
    printable_text = ''.join(
        char for char in answer_response.text if char in string.printable)
    payload = json.loads(printable_text)['data']
    answer = payload['answer']
    context = payload['context']

    references = parse_context(
        context, DocIDs().get_info, format_bibliography_info)
    return task_id, answer, fetch_pubmed(references, "pubmed.json", data_dir)