about summary refs log tree commit diff
path: root/gnqa
diff options
context:
space:
mode:
Diffstat (limited to 'gnqa')
m---------gnqa/R2R0
-rw-r--r--gnqa/data/study2/lists/human-questions.json10
-rwxr-xr-xgnqa/src/ingest_my_data.py72
-rw-r--r--gnqa/src/study2/document_operations.py32
-rw-r--r--gnqa/src/study2/run_questions.py19
-rw-r--r--gnqa/test.txt1
-rw-r--r--gnqa/test_r2r.py27
7 files changed, 151 insertions, 10 deletions
diff --git a/gnqa/R2R b/gnqa/R2R
deleted file mode 160000
-Subproject c61cf666addbacce695c66e717b8a9209d698e3
diff --git a/gnqa/data/study2/lists/human-questions.json b/gnqa/data/study2/lists/human-questions.json
index 4142e5b2..c6578922 100644
--- a/gnqa/data/study2/lists/human-questions.json
+++ b/gnqa/data/study2/lists/human-questions.json
@@ -168,5 +168,13 @@
             "What is GeneNetwork and how does it relate to aging research?"
 
         ]
+    },
+    {
+        "level": "citizenscientist",
+        "domain": "asthma",
+        "query": [
+            "Do underrepresented groups have a higher incidence of asthma than whites in the USA?"
+
+        ]
     }
-]
\ No newline at end of file
+]
diff --git a/gnqa/src/ingest_my_data.py b/gnqa/src/ingest_my_data.py
new file mode 100755
index 00000000..2e696844
--- /dev/null
+++ b/gnqa/src/ingest_my_data.py
@@ -0,0 +1,72 @@
+from os import listdir
+from os.path import isfile, join
+import sys
+import time
+import datetime
+import configparser
+from r2r import R2RClient, R2RException
+
+# Load settings from '_config.cfg'; the relative path is resolved against the
+# current working directory. ConfigParser.read() does not raise when the file
+# is missing, so a wrong cwd only surfaces as a KeyError further down.
+cfg = configparser.ConfigParser()
+cfg.read('_config.cfg')
+
+# Client bound to http://localhost:7272/. create_sample() runs as soon as the
+# module is executed -- NOTE(review): presumably seeds the server with a sample
+# document; confirm this is wanted on every run.
+client = R2RClient("http://localhost:7272/")
+client.documents.create_sample(hi_res=True)
+
+# Base directory, read from the PDF_DIR key of the DEFAULT section.
+data_dir = cfg['DEFAULT']['PDF_DIR']
+
+# Directories to ingest. The parts are glued with plain string concatenation,
+# so no path separator is inserted between PDF_DIR and the 'genetics' values.
+dir_paths = [
+    data_dir+cfg['genetics']['diabetes'],
+    data_dir+cfg['genetics']['aging']
+]
+
+print("The defined directory paths {0}".format(dir_paths))
+
+def ingest_files(client, sleep_time, doc_list):
+    """Upload each document in doc_list through the given client.
+
+    Args:
+        client: object exposing documents.create(file_path=...).
+        sleep_time: seconds to pause after every upload attempt.
+        doc_list: iterable of file paths to upload.
+
+    Returns:
+        List with the response of every upload that succeeded, in input
+        order. A failed upload (R2RException) is reported on stdout and
+        skipped; it does not abort the remaining uploads.
+    """
+    responses = []
+    for the_doc in doc_list:
+        try:
+            resp = client.documents.create(file_path=the_doc)
+        except R2RException as err:
+            # Best effort: report the failure and move on to the next file.
+            print("Problem ingesting {0}\n\t{1}".format(the_doc, err))
+        else:
+            print("Ingested {0} with the following response {1}".format(the_doc,resp))
+            responses.append(resp)
+        # brace against pinging API too quickly
+        time.sleep(sleep_time)
+    return responses
+
+def create_file_list(the_dir):
+    """Return the paths of the regular files located directly in the_dir.
+
+    Args:
+        the_dir: directory to scan; it is not traversed recursively.
+
+    Returns:
+        List of join(the_dir, name) for every entry that is a file, in the
+        order listdir() reports them. Subdirectories are left out.
+    """
+    candidates = (join(the_dir, name) for name in listdir(the_dir))
+    return [path for path in candidates if isfile(path)]
+
+def ingest_all_files(client, dir_list, sleep_time=3):
+    """Ingest the files found directly inside each directory of dir_list.
+
+    Args:
+        client: client object forwarded unchanged to ingest_files().
+        dir_list: iterable of directory paths; each one is listed with
+            create_file_list().
+        sleep_time: seconds ingest_files() pauses after each upload attempt.
+
+    Returns:
+        List with one entry per directory, each entry being whatever
+        ingest_files() returned for that directory's files.
+    """
+    begin_ingesting = datetime.datetime.now()
+    print(begin_ingesting)
+
+    result = [
+        ingest_files(client, sleep_time, create_file_list(the_dir))
+        for the_dir in dir_list
+    ]
+
+    # Report how long the whole run took.
+    print("Ingestion took {0}".format(datetime.datetime.now() - begin_ingesting))
+    return result
+
+
+ingest_all_files(client, dir_paths)
\ No newline at end of file
diff --git a/gnqa/src/study2/document_operations.py b/gnqa/src/study2/document_operations.py
index 3112d915..f8ffdefe 100644
--- a/gnqa/src/study2/document_operations.py
+++ b/gnqa/src/study2/document_operations.py
@@ -1,6 +1,8 @@
 import os
-#import sys
+import sys
 import json
+#import inspect
+from r2r import RAGResponse
 #import time
 #import configparser
 '''
@@ -37,6 +39,7 @@ class DocOps:
     def writeDatasetFile(responses, outp_file):
         print(outp_file)
         output = json.dumps(responses, indent=2)
+
         if os.path.exists(outp_file):
             with open(outp_file, "a") as the_data:
                 the_data.write('\n\n' + output)
@@ -44,6 +47,33 @@ class DocOps:
             with open(outp_file, "a") as the_data:
                 the_data.write(output)
 
+    def jsonifyRAGResponse(resps):
+        """Print citation count, generated answer and metadata of each response.
+
+        Despite the name, nothing is converted to JSON and nothing is
+        returned; the summary only goes to stdout. Takes no self.
+
+        Args:
+            resps: iterable of objects exposing citations, generated_answer
+                and metadata attributes.
+        """
+        template = "Num citations {0}\nAnswer --> {1}\n\t{2}"
+        for item in resps:
+            num_citations = len(item.citations)
+            print(template.format(num_citations, item.generated_answer, item.metadata))
+
+    def writeRAGResponses(resps, outp_file):
+        """Append every RAG response in resps to outp_file as JSON text.
+
+        Args:
+            resps: mapping whose values provide model_dump() and
+                model_dump_json() (presumably r2r RAGResponse objects --
+                confirm against the caller). The dumped dict must contain
+                ["results"]["generated_answer"] and
+                ["results"]["metadata"]["id"].
+            outp_file: path of the file to append to; it is created by the
+                first write when it does not exist yet.
+
+        Each response is written as one JSON document. Documents are
+        separated by a blank line, and a separator is also written first
+        when outp_file already exists, so earlier content is kept apart.
+        """
+        print(outp_file)
+        # Only the very first write into a brand-new file goes without separator.
+        needs_separator = os.path.exists(outp_file)
+        for resp in resps.values():
+            results = resp.model_dump()["results"]
+            payload = resp.model_dump_json()
+            print("The answer -->  {0}\nID --> {1}".format(
+                results["generated_answer"],
+                results["metadata"]["id"]))
+            # Explicit UTF-8: the JSON text may contain non-ASCII characters and
+            # the platform default encoding is not guaranteed to handle them.
+            with open(outp_file, "a", encoding="utf-8") as the_data:
+                if needs_separator:
+                    the_data.write('\n\n')
+                the_data.write(payload)
+            needs_separator = True
+
+
+
     def get_r2r_ragas_out_dict():
         return { "titles":        [],
                 "extraction_id": [],
diff --git a/gnqa/src/study2/run_questions.py b/gnqa/src/study2/run_questions.py
index 07aee5f0..9bac5a23 100644
--- a/gnqa/src/study2/run_questions.py
+++ b/gnqa/src/study2/run_questions.py
@@ -2,17 +2,19 @@ import json
 import sys
 import os
 
-from r2r import R2RClient
-from study2.document_operations import DocOps, QuestionList
+from r2r import R2RClient, RAGResponse
+from document_operations import DocOps, QuestionList
 
 '''
 *******************************************************************************
 Variables
 *******************************************************************************
 '''
+rag_gen_cfg = {"model": "openai/gpt-4o-mini", "temperature": 0.0, "use_hybrid_search": True}
 rag_response = {}
-client       = R2RClient("http://localhost:8000")
-health_resp  = client.health()
+#client       = R2RClient("http://localhost:8000")
+client       = R2RClient("http://localhost:7272")
+#health_resp  = client.health()
 
 '''
 *******************************************************************************
@@ -20,19 +22,20 @@ Commands
 *******************************************************************************
 '''
 
-print("The R2R client's health status is {0}".format(health_resp))
+#print("The R2R client's health status is {0}".format(health_resp))
 
 try:
     read_file = str(sys.argv[1])
     out_file  = str(sys.argv[2])
 except:
-    exit('Example use "python run_questions.py ../data/questions/human/de/aging.json ../data/responses/human/de/aging_resp.json"')
+    exit('Example use "python run_questions.py ../../data/study2/lists/human-questions.json ../../data/test_study/human/de/aging_resp.json"')
 
 qLst = QuestionList(read_file, 1) # second parameter is for verbose output
 ndx = 1
 for question in qLst.get("domainexpert","aging"):
     print('Getting response for the following question --> {0}'.format(question))
-    rag_response[str(ndx)] = client.rag(question)
+    #rag_response[str(ndx)] = client.rag(question)
+    rag_response[str(ndx)] = client.retrieval.rag(question, rag_gen_cfg)
     ndx += 1
 
-DocOps.writeDatasetFile(rag_response, out_file)
\ No newline at end of file
+DocOps.writeRAGResponses(rag_response, out_file)
\ No newline at end of file
diff --git a/gnqa/test.txt b/gnqa/test.txt
new file mode 100644
index 00000000..24b5ce9e
--- /dev/null
+++ b/gnqa/test.txt
@@ -0,0 +1 @@
+John is a person that works at Google.
\ No newline at end of file
diff --git a/gnqa/test_r2r.py b/gnqa/test_r2r.py
new file mode 100644
index 00000000..65052378
--- /dev/null
+++ b/gnqa/test_r2r.py
@@ -0,0 +1,27 @@
+from r2r import R2RClient
+
+# Smoke test: send a single RAG query through a default-constructed client
+# and print the three parts of the reply.
+client = R2RClient()
+
+# One-off setup used while experimenting (kept for reference, not executed):
+#   with open("test.txt", "w") as file:
+#       file.write("John is a person that works at Google.")
+#   client.documents.create(file_path="test.txt")
+
+# Call RAG directly
+generation_config = {"model": "openai/gpt-4o-mini", "temperature": 0.0}
+rag_response = client.retrieval.rag(
+    query="What is the role of obesity in diabetes",
+    rag_generation_config=generation_config,
+)
+
+# Retrieved context first, then the generated answer, then its citations.
+results = rag_response.results
+print(f"Search Results:\n{results.search_results}")
+print(f"Completion:\n{results.generated_answer}")
+print(f"Citations:\n{results.citations}")