aboutsummaryrefslogtreecommitdiff
path: root/gnqa
diff options
context:
space:
mode:
authorS. Solomon Darnell2025-03-28 21:52:21 -0500
committerS. Solomon Darnell2025-03-28 21:52:21 -0500
commit4a52a71956a8d46fcb7294ac71734504bb09bcc2 (patch)
treeee3dc5af3b6313e921cd920906356f5d4febc4ed /gnqa
parentcc961e04ba734dd72309fb548a2f97d67d578813 (diff)
downloadgn-ai-master.tar.gz
two versions of R2R are here HEAD master
Diffstat (limited to 'gnqa')
m---------gnqa/R2R0
-rw-r--r--gnqa/data/study2/lists/human-questions.json10
-rwxr-xr-xgnqa/src/ingest_my_data.py72
-rw-r--r--gnqa/src/study2/document_operations.py32
-rw-r--r--gnqa/src/study2/run_questions.py19
-rw-r--r--gnqa/test.txt1
-rw-r--r--gnqa/test_r2r.py27
7 files changed, 151 insertions, 10 deletions
diff --git a/gnqa/R2R b/gnqa/R2R
deleted file mode 160000
-Subproject c61cf666addbacce695c66e717b8a9209d698e3
diff --git a/gnqa/data/study2/lists/human-questions.json b/gnqa/data/study2/lists/human-questions.json
index 4142e5b2..c6578922 100644
--- a/gnqa/data/study2/lists/human-questions.json
+++ b/gnqa/data/study2/lists/human-questions.json
@@ -168,5 +168,13 @@
"What is GeneNetwork and how does it relate to aging research?"
]
+ },
+ {
+ "level": "citizenscientist",
+ "domain": "asthma",
+ "query": [
+ "Do underrepresented groups have a higher incidence of asthma than whites in the USA?"
+
+ ]
}
-] \ No newline at end of file
+]
diff --git a/gnqa/src/ingest_my_data.py b/gnqa/src/ingest_my_data.py
new file mode 100755
index 00000000..2e696844
--- /dev/null
+++ b/gnqa/src/ingest_my_data.py
@@ -0,0 +1,72 @@
+from os import listdir
+from os.path import isfile, join
+import sys
+import time
+import datetime
+import configparser
+from r2r import R2RClient, R2RException
+
+# Load the local settings; PDF_DIR and the per-topic sub-directories are read
+# from this file further down.
+cfg = configparser.ConfigParser()
+if not cfg.read('_config.cfg'):
+    # ConfigParser.read() silently skips files it cannot open, which would
+    # otherwise surface as a KeyError on the first lookup below.
+    sys.exit("Could not read '_config.cfg' from the current directory")
+
+client = R2RClient("http://localhost:7272/")
+client.documents.create_sample(hi_res=True)
+
+data_dir = cfg['DEFAULT']['PDF_DIR']
+
+# NOTE: plain string concatenation, no separator is inserted -- the configured
+# values themselves have to supply it.
+dir_paths = [
+    data_dir+cfg['genetics']['diabetes'],
+    data_dir+cfg['genetics']['aging'],
+]
+
+print("The defined directory paths {0}".format(dir_paths))
+
+def ingest_files(client, sleep_time, doc_list):
+    """Ingest each document in ``doc_list`` through the R2R client.
+
+    Args:
+        client: R2R client; only ``client.documents.create(file_path=...)``
+            is called on it.
+        sleep_time: Seconds to pause after every ingestion attempt.
+        doc_list: Iterable of file paths to ingest.
+
+    Returns:
+        List with the response of every successfully ingested document, in
+        ``doc_list`` order. Documents that raise ``R2RException`` are
+        reported on stdout and left out of the list.
+    """
+    responses = []
+    for the_doc in doc_list:
+        try:
+            resp = client.documents.create(file_path=the_doc)
+        except R2RException as err:
+            # Best effort: report the failure and carry on with the rest.
+            print("Problem ingesting {0}\n\t{1}".format(the_doc, err))
+        else:
+            print("Ingested {0} with the following response {1}".format(the_doc,resp))
+            responses.append(resp)
+        # brace against pinging API too quickly
+        time.sleep(sleep_time)
+    return responses
+
+def create_file_list(the_dir):
+    """Return the paths of the regular files directly inside ``the_dir``.
+
+    Sub-directories are skipped and not descended into. Every entry is
+    ``the_dir`` joined with the file name, and the list is sorted so that
+    repeated runs process the files in the same order.
+
+    Args:
+        the_dir: Directory to scan.
+
+    Returns:
+        Sorted list of file paths.
+    """
+    return sorted(
+        join(the_dir, f) for f in listdir(the_dir) if isfile(join(the_dir, f))
+    )
+
+def ingest_all_files(client, dir_list):
+    """Ingest every file found directly inside each directory of ``dir_list``.
+
+    Args:
+        client: R2R client, handed through to ``ingest_files``.
+        dir_list: Iterable of directory paths, each scanned with
+            ``create_file_list``.
+
+    Returns:
+        One entry per directory, in ``dir_list`` order, each holding whatever
+        ``ingest_files`` returned for that directory.
+    """
+    sleep_time = 3  # seconds to wait after each ingestion request
+    begin_ingesting = datetime.datetime.now()
+    print(begin_ingesting)
+
+    result = []
+    for the_dir in dir_list:
+        the_list = create_file_list(the_dir)
+        result.append(ingest_files(client, sleep_time, the_list))
+
+    end_ingesting = datetime.datetime.now()
+    # show how long ingesting the documents took
+    print("Ingested {0} directories in {1}".format(
+        len(result), end_ingesting - begin_ingesting))
+    return result
+
+
+ingest_all_files(client, dir_paths) \ No newline at end of file
diff --git a/gnqa/src/study2/document_operations.py b/gnqa/src/study2/document_operations.py
index 3112d915..f8ffdefe 100644
--- a/gnqa/src/study2/document_operations.py
+++ b/gnqa/src/study2/document_operations.py
@@ -1,6 +1,8 @@
import os
-#import sys
+import sys
import json
+#import inspect
+from r2r import RAGResponse
#import time
#import configparser
'''
@@ -37,6 +39,7 @@ class DocOps:
def writeDatasetFile(responses, outp_file):
print(outp_file)
output = json.dumps(responses, indent=2)
+
if os.path.exists(outp_file):
with open(outp_file, "a") as the_data:
the_data.write('\n\n' + output)
@@ -44,6 +47,33 @@ class DocOps:
with open(outp_file, "a") as the_data:
the_data.write(output)
+ def jsonifyRAGResponse(resps):
+     """Print a short summary of every response in ``resps``.
+
+     For each item the number of citations, the generated answer and the
+     metadata are written to stdout. Despite the name, nothing is converted
+     to JSON and nothing is returned.
+     """
+     summary = "Num citations {0}\nAnswer --> {1}\n\t{2}"
+     for item in resps:
+         n_citations = len(item.citations)
+         print(summary.format(n_citations, item.generated_answer, item.metadata))
+
+ def writeRAGResponses(resps, outp_file):
+     """Append every RAG response in ``resps`` to ``outp_file`` as JSON.
+
+     Args:
+         resps: Mapping whose values expose ``model_dump()`` and
+             ``model_dump_json()``; the dump must contain
+             ``results.generated_answer`` and ``results.metadata.id``.
+         outp_file: Path of the file to append to. Documents are separated
+             by a blank line, so the file holds a sequence of JSON documents
+             rather than one JSON value.
+     """
+     print(outp_file)
+     if not resps:
+         # Nothing to write; do not create an empty file.
+         return
+     # A separator goes in front of a document whenever the file was already
+     # there, and in front of every document after the first one.
+     needs_separator = os.path.exists(outp_file)
+     # Open once for the whole batch instead of once per response.
+     with open(outp_file, "a") as the_data:
+         for resp in resps.values():
+             output = resp.model_dump()
+             print("The answer --> {0}\nID --> {1}".format(
+                 output["results"]["generated_answer"],
+                 output["results"]["metadata"]["id"]))
+             if needs_separator:
+                 the_data.write('\n\n')
+             the_data.write(resp.model_dump_json())
+             needs_separator = True
+
+
+
def get_r2r_ragas_out_dict():
return { "titles": [],
"extraction_id": [],
diff --git a/gnqa/src/study2/run_questions.py b/gnqa/src/study2/run_questions.py
index 07aee5f0..9bac5a23 100644
--- a/gnqa/src/study2/run_questions.py
+++ b/gnqa/src/study2/run_questions.py
@@ -2,17 +2,19 @@ import json
import sys
import os
-from r2r import R2RClient
-from study2.document_operations import DocOps, QuestionList
+from r2r import R2RClient, RAGResponse
+from document_operations import DocOps, QuestionList
'''
*******************************************************************************
Variables
*******************************************************************************
'''
+rag_gen_cfg = {"model": "openai/gpt-4o-mini", "temperature": 0.0, "use_hybrid_search": True}
rag_response = {}
-client = R2RClient("http://localhost:8000")
-health_resp = client.health()
+#client = R2RClient("http://localhost:8000")
+client = R2RClient("http://localhost:7272")
+#health_resp = client.health()
'''
*******************************************************************************
@@ -20,19 +22,20 @@ Commands
*******************************************************************************
'''
-print("The R2R client's health status is {0}".format(health_resp))
+#print("The R2R client's health status is {0}".format(health_resp))
try:
read_file = str(sys.argv[1])
out_file = str(sys.argv[2])
except:
- exit('Example use "python run_questions.py ../data/questions/human/de/aging.json ../data/responses/human/de/aging_resp.json"')
+ exit('Example use "python run_questions.py ../../data/study2/lists/human-questions.json ../../data/test_study/human/de/aging_resp.json"')
qLst = QuestionList(read_file, 1) # second parameter is for verbose output
ndx = 1
for question in qLst.get("domainexpert","aging"):
print('Getting response for the following question --> {0}'.format(question))
- rag_response[str(ndx)] = client.rag(question)
+ #rag_response[str(ndx)] = client.rag(question)
+ rag_response[str(ndx)] = client.retrieval.rag(question, rag_gen_cfg)
ndx += 1
-DocOps.writeDatasetFile(rag_response, out_file) \ No newline at end of file
+DocOps.writeRAGResponses(rag_response, out_file) \ No newline at end of file
diff --git a/gnqa/test.txt b/gnqa/test.txt
new file mode 100644
index 00000000..24b5ce9e
--- /dev/null
+++ b/gnqa/test.txt
@@ -0,0 +1 @@
+John is a person that works at Google. \ No newline at end of file
diff --git a/gnqa/test_r2r.py b/gnqa/test_r2r.py
new file mode 100644
index 00000000..65052378
--- /dev/null
+++ b/gnqa/test_r2r.py
@@ -0,0 +1,27 @@
+from r2r import R2RClient
+
+# Manual check: send one question to the R2R server and print the three parts
+# of the reply.
+client = R2RClient()
+
+# The sample document test.txt ("John is a person that works at Google.") can
+# be ingested beforehand with client.documents.create(file_path="test.txt").
+
+# Call RAG directly
+rag_response = client.retrieval.rag(
+    query="What is the role of obesity in diabetes",
+    rag_generation_config={"model": "openai/gpt-4o-mini", "temperature": 0.0},
+)
+
+print(f"Search Results:\n{rag_response.results.search_results}")
+print(f"Completion:\n{rag_response.results.generated_answer}")
+print(f"Citations:\n{rag_response.results.citations}")