diff options
Diffstat (limited to 'gnqa')
m--------- | gnqa/R2R | 0 | ||||
-rw-r--r-- | gnqa/data/study2/lists/human-questions.json | 10 | ||||
-rwxr-xr-x | gnqa/src/ingest_my_data.py | 72 | ||||
-rw-r--r-- | gnqa/src/study2/document_operations.py | 32 | ||||
-rw-r--r-- | gnqa/src/study2/run_questions.py | 19 | ||||
-rw-r--r-- | gnqa/test.txt | 1 | ||||
-rw-r--r-- | gnqa/test_r2r.py | 27 |
7 files changed, 151 insertions, 10 deletions
diff --git a/gnqa/R2R b/gnqa/R2R deleted file mode 160000 -Subproject c61cf666addbacce695c66e717b8a9209d698e3 diff --git a/gnqa/data/study2/lists/human-questions.json b/gnqa/data/study2/lists/human-questions.json index 4142e5b2..c6578922 100644 --- a/gnqa/data/study2/lists/human-questions.json +++ b/gnqa/data/study2/lists/human-questions.json @@ -168,5 +168,13 @@ "What is GeneNetwork and how does it relate to aging research?" ] + }, + { + "level": "citizenscientist", + "domain": "asthma", + "query": [ + "Do underrepresented groups have a higher incidence of asthma than whites in the USA?" + + ] } -]
\ No newline at end of file +] diff --git a/gnqa/src/ingest_my_data.py b/gnqa/src/ingest_my_data.py new file mode 100755 index 00000000..2e696844 --- /dev/null +++ b/gnqa/src/ingest_my_data.py @@ -0,0 +1,72 @@ +from os import listdir +from os.path import isfile, join +import sys +import time +import datetime +import configparser +from r2r import R2RClient, R2RException + +cfg = configparser.ConfigParser() +cfg.read('_config.cfg') + +client = R2RClient("http://localhost:7272/") +client.documents.create_sample(hi_res=True) + +data_dir = cfg['DEFAULT']['PDF_DIR'] + +dir_paths = [ + data_dir+cfg['genetics']['diabetes'], + data_dir+cfg['genetics']['aging'] +] + +print("The defined directory paths {0}".format(dir_paths)) + +def ingest_files(client, sleep_time, doc_list): + responses = [] + resp = () + for the_doc in doc_list: + #print(the_doc) + try: + resp = client.documents.create(file_path=the_doc) + print("Ingested {0} with the following response {1}".format(the_doc,resp)) + except R2RException: + print("Problem ingesting {0}\n\t{1}".format(the_doc, sys.exception())) + #except: + #responses.append(resp) + # brace against pinging API too quickly + time.sleep(sleep_time) + return responses + +def create_file_list(the_dir): + return [ join(the_dir, f) for f in listdir(the_dir) if isfile(join(the_dir, f))] + #print("The list has {0} files".format(len(result))) + ''' + # subroutine to test list content + for the_pdf in the_pdfs: + print('{0} -> {1}'.format(ndx, the_pdf)) + ndx += 1 + ''' + return result + +def ingest_all_files(client, dir_list): + result = [] + sleep_time = 3 + print(datetime.datetime.now()) + begin_ingesting = datetime.datetime.now() + + for the_dir in dir_list: + the_list = create_file_list(the_dir) + result.append(ingest_files(client, sleep_time, the_list)) + + #ingest_resp = () + #resps = ingest_files(client, sleep_time, file_list) + end_ingesting = datetime.datetime.now() + + # show results of ingesting documents + #print("Entry [{0}]\t{1} {2}\n\t\t{3}".format(ndx,file_list, (end_ingesting-begin_ingesting), ingest_response)) + + # brace against pinging API too quickly + #time.sleep(sleeptime) + + +ingest_all_files(client, dir_paths)
\ No newline at end of file diff --git a/gnqa/src/study2/document_operations.py b/gnqa/src/study2/document_operations.py index 3112d915..f8ffdefe 100644 --- a/gnqa/src/study2/document_operations.py +++ b/gnqa/src/study2/document_operations.py @@ -1,6 +1,8 @@ import os -#import sys +import sys import json +#import inspect +from r2r import RAGResponse #import time #import configparser ''' @@ -37,6 +39,7 @@ class DocOps: def writeDatasetFile(responses, outp_file): print(outp_file) output = json.dumps(responses, indent=2) + if os.path.exists(outp_file): with open(outp_file, "a") as the_data: the_data.write('\n\n' + output) @@ -44,6 +47,33 @@ class DocOps: with open(outp_file, "a") as the_data: the_data.write(output) + def jsonifyRAGResponse(resps): + for resp in resps: + print("Num citations {0}\nAnswer --> {1}\n\t{2}".format( + len(resp.citations), + resp.generated_answer, + resp.metadata)) + + def writeRAGResponses(resps, outp_file): + print(outp_file) + for ndx in resps: + resp = resps[ndx] + #methods = [attr for attr in dir(obj) if not attr.startswith('_')] # Exclude private methods + #print(methods) + output = resp.model_dump() + output_to_write = resp.model_dump_json() + print("The answer --> {0}\nID --> {1}".format( + output["results"]["generated_answer"], + output["results"]["metadata"]["id"])) + if os.path.exists(outp_file): + with open(outp_file, "a") as the_data: + the_data.write('\n\n' + output_to_write) + else: + with open(outp_file, "a") as the_data: + the_data.write(output_to_write) + + + def get_r2r_ragas_out_dict(): return { "titles": [], "extraction_id": [], diff --git a/gnqa/src/study2/run_questions.py b/gnqa/src/study2/run_questions.py index 07aee5f0..9bac5a23 100644 --- a/gnqa/src/study2/run_questions.py +++ b/gnqa/src/study2/run_questions.py @@ -2,17 +2,19 @@ import json import sys import os -from r2r import R2RClient -from study2.document_operations import DocOps, QuestionList +from r2r import R2RClient, RAGResponse +from document_operations import DocOps, QuestionList ''' ******************************************************************************* Variables ******************************************************************************* ''' +rag_gen_cfg = {"model": "openai/gpt-4o-mini", "temperature": 0.0, "use_hybrid_search": True} rag_response = {} -client = R2RClient("http://localhost:8000") -health_resp = client.health() +#client = R2RClient("http://localhost:8000") +client = R2RClient("http://localhost:7272") +#health_resp = client.health() ''' ******************************************************************************* @@ -20,19 +22,20 @@ Commands ******************************************************************************* ''' -print("The R2R client's health status is {0}".format(health_resp)) +#print("The R2R client's health status is {0}".format(health_resp)) try: read_file = str(sys.argv[1]) out_file = str(sys.argv[2]) except: - exit('Example use "python run_questions.py ../data/questions/human/de/aging.json ../data/responses/human/de/aging_resp.json"') + exit('Example use "python run_questions.py ../../data/study2/lists/human-questions.json ../../data/test_study/human/de/aging_resp.json"') qLst = QuestionList(read_file, 1) # second parameter is for verbose output ndx = 1 for question in qLst.get("domainexpert","aging"): print('Getting response for the following question --> {0}'.format(question)) - rag_response[str(ndx)] = client.rag(question) + #rag_response[str(ndx)] = client.rag(question) + rag_response[str(ndx)] = client.retrieval.rag(question, rag_gen_cfg) ndx += 1 -DocOps.writeDatasetFile(rag_response, out_file)
\ No newline at end of file +DocOps.writeRAGResponses(rag_response, out_file)
\ No newline at end of file diff --git a/gnqa/test.txt b/gnqa/test.txt new file mode 100644 index 00000000..24b5ce9e --- /dev/null +++ b/gnqa/test.txt @@ -0,0 +1 @@ +John is a person that works at Google.
\ No newline at end of file diff --git a/gnqa/test_r2r.py b/gnqa/test_r2r.py new file mode 100644 index 00000000..65052378 --- /dev/null +++ b/gnqa/test_r2r.py @@ -0,0 +1,27 @@ +from r2r import R2RClient + +client = R2RClient() + +#with open("test.txt", "w") as file: + +# file.write("John is a person that works at Google.") + +#client.documents.create(file_path="test.txt") + +# Call RAG directly + +rag_response = client.retrieval.rag( + + query="What is the role of obesity in diabetes", + + rag_generation_config={"model": "openai/gpt-4o-mini", "temperature": 0.0}, + +) + +print(f"Search Results:\n{rag_response.results.search_results}") + + +print(f"Completion:\n{rag_response.results.generated_answer}") + + +print(f"Citations:\n{rag_response.results.citations}") |