1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
"""this module contains code for processing response from fahamu client.py"""
# pylint: disable=C0301
import os
import string
import json
import logging
from urllib.parse import quote
from gn3.llms.client import GeneNetworkQAClient
# URL of the Fahamu tasks API.
# NOTE(review): not referenced by any code visible in this module -- confirm
# whether it is still needed.
BASE_URL = 'https://genenetwork.fahamuai.com/api/tasks'
# Absolute path of the directory that contains this module; DocIDs.load_file
# resolves its JSON file names against it.
BASEDIR = os.path.abspath(os.path.dirname(__file__))
class DocIDs:
    """Lookup table that maps document ids to readable document names.

    The table is built from two JSON files stored next to this module:
    ``doc_ids.json`` and ``all_files.json``.
    """

    def __init__(self):
        """Load both JSON files and merge them into ``self.doc_ids``.

        * ``doc_ids.json`` -- becomes the initial id -> name mapping
          (the GN references, per the original author's note).
        * ``all_files.json`` -- kept as ``self.sugar_doc_ids`` (the diabetes
          references, per the original author's note); its list entries
          are folded into ``self.doc_ids``.
        """
        self.doc_ids = self.load_file("doc_ids.json")
        self.sugar_doc_ids = self.load_file("all_files.json")
        self.format_doc_ids(self.sugar_doc_ids)

    def load_file(self, file_name):
        """Return the decoded JSON content of ``file_name`` found in BASEDIR.

        Raises:
            FileNotFoundError: if the resolved path is not an existing
                regular file.
        """
        json_path = os.path.join(BASEDIR, file_name)
        if not os.path.isfile(json_path):
            raise FileNotFoundError(f"{json_path}-- FIle does not exist\n")
        with open(json_path, "rb") as stream:
            return json.load(stream)

    def format_doc_ids(self, docs):
        """Register every document listed in ``docs`` in ``self.doc_ids``.

        Only values of ``docs`` that are lists are considered; each item of
        such a list must provide an ``"id"`` and a ``"filename"``.  The
        stored name is the filename without a trailing ``.pdf`` / ``.txt``
        and with every underscore removed.
        """
        for entries in docs.values():
            if not isinstance(entries, list):
                continue
            for item in entries:
                stem = item["filename"].removesuffix(".pdf").removesuffix(".txt")
                self.doc_ids[item["id"]] = stem.replace("_", "")

    def get_info(self, doc_id):
        """Return the name stored for ``doc_id``, or ``doc_id`` itself when
        the id is unknown."""
        return self.doc_ids.get(doc_id, doc_id)
def format_bibliography_info(bib_info):
    """Turn a bibliography entry into a display string.

    * a ``dict`` is rendered as ``author.title.year.doi`` followed by a
      single trailing space (all four keys must be present);
    * a ``str`` is returned without a trailing ``.txt``;
    * any other value is handed back untouched.
    """
    if isinstance(bib_info, dict):
        author, title, year, doi = (
            bib_info[field] for field in ("author", "title", "year", "doi"))
        return f"{author}.{title}.{year}.{doi} "
    if isinstance(bib_info, str):
        return bib_info.removesuffix('.txt')
    return bib_info
def filter_response_text(val):
    """Drop every character of ``val`` that is not in ``string.printable``
    and decode what remains as JSON."""
    allowed = string.printable
    cleaned = ''.join(str(char) for char in val if char in allowed)
    return json.loads(cleaned)
def parse_context(context, get_info_func, format_bib_func):
    """Flatten the ``context`` mapping of an answer into a list of references.

    Args:
        context: mapping of document id -> iterable of entries; every entry
            is a mapping that provides a ``"text"`` string.
        get_info_func: callable taking a document id and returning the
            information known about it.
        format_bib_func: callable turning that information into the value
            stored under ``"bibInfo"``.

    Returns:
        A list with one dict per document, in the iteration order of
        ``context``, with the keys ``"doc_id"``, ``"bibInfo"`` and
        ``"comboTxt"``.
    """
    results = []
    for doc_id, summary in context.items():
        # Each text fragment is prefixed with a tab.  ``join`` builds the
        # string in one pass instead of re-copying it on every ``+=``.
        combo_txt = "".join("\t" + entry["text"] for entry in summary)
        doc_info = get_info_func(doc_id)
        # When the lookup hands the id back unchanged (as DocIDs.get_info
        # does for unknown ids) the raw id is used and no formatting is
        # attempted.
        bib_info = doc_id if doc_id == doc_info else format_bib_func(doc_info)
        results.append(
            {"doc_id": doc_id, "bibInfo": bib_info, "comboTxt": combo_txt})
    return results
def load_file(filename, dir_path):
    """Read ``filename`` from ``dir_path`` and return its decoded JSON content.

    Raises:
        FileNotFoundError: if the joined path is not an existing regular
            file (this includes the case where it is a directory).
    """
    json_path = os.path.join(dir_path, f"{filename}")
    if os.path.isfile(json_path):
        with open(json_path, "rb") as stream:
            return json.load(stream)
    raise FileNotFoundError(f"{filename} was not found or is a directory")
def fetch_pubmed(references, file_name, data_dir=""):
    """Attach pubmed information to ``references`` in place and return them.

    The JSON file ``file_name`` is read from ``<data_dir>/gn-meta/lit``.
    Every reference whose ``"doc_id"`` has a truthy entry in that file gets
    the entry stored under its ``"pubmed"`` key.  When the file cannot be
    found the problem is logged and the references are returned unchanged.
    """
    lit_dir = os.path.join(data_dir, "gn-meta/lit")
    try:
        pubmed_index = load_file(file_name, lit_dir)
        for ref in references:
            entry = pubmed_index.get(ref["doc_id"])
            if entry:
                ref["pubmed"] = entry
    except FileNotFoundError:
        logging.error("failed to find pubmed_path for %s/%s",
                      data_dir, file_name)
    return references
def get_gnqa(query, auth_token, data_dir=""):
    """Entry function for the gn3 API endpoint.

    Args:
        query: the question to ask, e.g. "what is a gene".
        auth_token: token handed to ``GeneNetworkQAClient`` as its API key.
        data_dir: base directory of the gn3 data, forwarded to
            ``fetch_pubmed``.

    Returns:
        A ``(task_id, answer, references)`` tuple where ``task_id`` is the
        identifier returned by the client's ``ask`` call, ``answer`` is the
        ``data.answer`` field of the decoded response, and ``references`` is
        the parsed ``data.context`` after it has passed through
        ``fetch_pubmed``.
    """
    client = GeneNetworkQAClient(api_key=auth_token)
    # Only the task id of the ``ask`` call is needed; the response that
    # comes with it is discarded.
    _ask_response, task_id = client.ask('?ask=' + quote(query), query=query)
    answer_response, _status = client.get_answer(task_id)
    payload = filter_response_text(answer_response.text)['data']
    answer = payload['answer']
    context = payload['context']
    references = parse_context(
        context, DocIDs().get_info, format_bibliography_info)
    return task_id, answer, fetch_pubmed(references, "pubmed.json", data_dir)
|