gn3/llms/process.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

"""this module contains code for processing response from fahamu client.py"""
# pylint: disable=C0301
import os
import re
import string
import json
import logging
from urllib.parse import quote

from gn3.llms.client import GeneNetworkQAClient


BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks'
BASEDIR = os.path.abspath(os.path.dirname(__file__))


class DocIDs():
    """ Class Method to Parse document id and names from files"""
    def __init__(self):
        """
        init method for Docids
        * doc_ids.json: open doc_ids for gn references
        * sugar_doc_ids:  open doc_ids for diabetes references
        """
        self.doc_ids = load_file("doc_ids.json", BASEDIR)
        sugar_doc_ids = load_file("all_files.json", BASEDIR)
        self.format_doc_ids(sugar_doc_ids)

    def format_doc_ids(self, docs):
        """method to format doc_ids for list items doc_id and doc_name"""
        for _key, val in docs.items():
            if isinstance(val, list):
                for doc_obj in val:
                    doc_name = doc_obj["filename"].removesuffix(".pdf").removesuffix(".txt").replace("_", "")
                    self.doc_ids.update({doc_obj["id"]:  doc_name})

    def get_info(self, doc_id):
        """ interface to make read from doc_ids
           and extract info data  else returns
           doc_id
        Args:
            doc_id: str: a search key for doc_ids
        Returns:
              an object if doc id exists else
              raises a KeyError
        """
        return self.doc_ids[doc_id]

def format_bibliography_info(bib_info):
    """Utility function for formatting bibliography info
    """
    if isinstance(bib_info, str):
        return bib_info.removesuffix('.txt')
    elif isinstance(bib_info, dict):
        return f"{bib_info['author']}.{bib_info['title']}.{bib_info['year']}.{bib_info['doi']} "
    return bib_info


def parse_context(context, get_info_func, format_bib_func):
    """Function to parse doc_ids content
     Args:
         context: raw references from  fahamu api
         get_info_func: function to get doc_ids info
         format_bib_func:  function to foramt bibliography info
    Returns:
          an list with each item having (doc_id,bib_info,
          combined reference text)
    """
    results = []
    for doc_ids, summary in context.items():
        combo_txt = ""
        for entry in summary:
            combo_txt += "\t" + entry["text"]
        try:
            doc_info = get_info_func(doc_ids)
            bib_info = format_bib_func(doc_info)
        except KeyError:
            bib_info = doc_ids
        pattern = r'(https?://|www\.)[\w.-]+(\.[a-zA-Z]{2,})([/\w.-]*)*'
        combo_text = re.sub(pattern,
                            lambda x: f"<a href='{x[0]}' target=_blank> {x[0]} </a>",
                            combo_txt)
        results.append(
            {"doc_id": doc_ids, "bibInfo": bib_info,
             "comboTxt": combo_text})
    return results


def load_file(filename, dir_path):
    """Utility function to read json file
    Args:
        filename:  file name to read
        dir_path:  base directory for the file
    Returns: json data read to a dict
    """
    with open(os.path.join(dir_path, f"{filename}"),
              "rb") as file_handler:
        return json.load(file_handler)


def fetch_pubmed(references, file_name, data_dir=""):
    """
    Fetches PubMed data from a JSON file and populates the\
    references dictionary.

    Args:
        references (dict): Dictionary with document IDs as keys\
    and reference data as values.
        filename (str): Name of the JSON file containing PubMed data.
        data_dir (str): Base directory where the data files are located.

    Returns:
        dict: Updated references dictionary populated with the PubMed data.
    """
    try:
        pubmed = load_file(file_name, os.path.join(data_dir, "gn-meta/lit"))
        for reference in references:
            if pubmed.get(reference["doc_id"]):
                reference["pubmed"] = pubmed.get(reference["doc_id"])
        return references

    except FileNotFoundError:
        logging.error("failed to find pubmed_path for %s/%s",
                      data_dir, file_name)
        return references


def get_gnqa(query, auth_token, data_dir=""):
    """entry function for the gn3 api endpoint()
    ARGS:
         query: what is  a gene
         auth_token: token to connect to api_client
         data_dir:  base datirectory for gn3 data
    Returns:
         task_id: fahamu unique identifier for task
         answer
         references: contains doc_name,reference,pub_med_info
    """
    api_client = GeneNetworkQAClient(api_key=auth_token)
    res, task_id = api_client.ask('?ask=' + quote(query), query=query)
    res, _status = api_client.get_answer(task_id)
    resp_text = json.loads(''.join([str(char)
                           for char in res.text if char in string.printable]))
    answer = re.sub(r'(https?://|www\.)[\w.-]+(\.[a-zA-Z]{2,})([/\w.-]*)*',
                    lambda x: f"<a href='{x[0]}' target=_blank> {x[0]} </a>",
                    resp_text["data"]["answer"])
    context = resp_text['data']['context']
    return task_id, answer, fetch_pubmed(parse_context(
                            context, DocIDs().get_info,
                            format_bibliography_info),
                            "pubmed.json", data_dir)