diff options
author | ShelbySolomonDarnell | 2024-09-24 23:40:11 +0300 |
---|---|---|
committer | ShelbySolomonDarnell | 2024-09-24 23:40:11 +0300 |
commit | cb28feac47dc1f6147260f1bc057970c54b314f3 (patch) | |
tree | 6c45082bc4d723468609f774170339c718576873 /gnqa/paper2_eval/src | |
parent | ea942f68346abcd6e51d1cc96b0c90361c3cdfa2 (diff) | |
download | gn-ai-cb28feac47dc1f6147260f1bc057970c54b314f3.tar.gz |
Human questions in JSON format; code for formatting the R2R response
Diffstat (limited to 'gnqa/paper2_eval/src')
-rw-r--r-- | gnqa/paper2_eval/src/parse_r2r_result.ipynb | 262 | ||||
-rw-r--r-- | gnqa/paper2_eval/src/parse_r2r_result.py | 44 | ||||
-rw-r--r-- | gnqa/paper2_eval/src/run_questions.py | 45 |
3 files changed, 351 insertions, 0 deletions
diff --git a/gnqa/paper2_eval/src/parse_r2r_result.ipynb b/gnqa/paper2_eval/src/parse_r2r_result.ipynb new file mode 100644 index 0000000..8ceac72 --- /dev/null +++ b/gnqa/paper2_eval/src/parse_r2r_result.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "#Parse RAGAS json output" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import sys\n", + "\n", + "verbose = 0\n", + "read_file = '/data/code/gn-ai/gnqa/paper2_eval/data/rag_out_1.json'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "def iterate_json(obj, thedict):\n", + " if isinstance(obj, dict):\n", + " for key, val in obj.items():\n", + " if (key == \"text\"):\n", + " thedict[\"contexts\"].append(val.replace(\"\\n\", \" \").strip())\n", + " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n", + " elif (key == \"associatedQuery\"):\n", + " #thedict[\"answer\"] = val#.replace(\"\\n\", \" \").strip()\n", + " thedict[\"question\"] = val#.replace(\"\\n\", \" \").strip()\n", + " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n", + " elif (key == \"id\"):\n", + " thedict[\"id\"].append(val.replace(\"\\n\", \" 
\").strip())\n", + " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n", + " elif (key == \"title\"):\n", + " thedict[\"titles\"].append(val.replace(\"\\n\", \" \").strip())\n", + " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n", + " elif (key == \"document_id\"):\n", + " thedict[\"document_id\"].append(val.replace(\"\\n\", \" \").strip())\n", + " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n", + " elif (key == \"content\"):\n", + " thedict[\"answer\"] = val.replace(\"\\n\", \" \").strip()\n", + " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n", + " elif (key == \"extraction_id\"):\n", + " thedict[\"extraction_id\"].append(val.replace(\"\\n\", \" \").strip())\n", + " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n", + " else:\n", + " if (len(obj.items()) == 1 ):\n", + " print(key, \" --> \", val)\n", + " iterate_json(val, thedict)\n", + " elif isinstance(obj, list):\n", + " for item in obj:\n", + " iterate_json(item, thedict)\n", + "\n", + "# this should be a json file with a list of input files and an output file\n", + "with open(read_file, \"r\") as r_file:\n", + " result_file = json.load(r_file)\n", + "\n", + "ragas_output = {\n", + " \"titles\": [],\n", + " \"extraction_id\": [],\n", + " \"document_id\": [],\n", + " \"id\": [],\n", + " \"contexts\": [],\n", + " \"answer\": \"\",\n", + " \"question\": \"\"}\n", + "vector_search_results = result_file[\"vector_search_results\"]\n", + "choices = result_file[\"choices\"]\n", + "iterate_json(vector_search_results, ragas_output)\n", + "iterate_json(choices, ragas_output)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"titles\": [\n", + " \"2011 - Annotating individual human genomes.pdf\",\n", + " 
\"2011 - Annotating individual human genomes.pdf\",\n", + " \"2011 - Annotating individual human genomes.pdf\",\n", + " \"2012 - Systems Biology Approaches to Nutrition.pdf\",\n", + " \"2006 - \\u03b22-adrenergic receptor and UCP3 variants modulate the relationship between age and type 2 diabetes mellitus.pdf\",\n", + " \"2001 - Demography in the age of genomics.pdf\",\n", + " \"2012 - Systems Biology Approaches to Nutrition.pdf\",\n", + " \"2004 - A genome scan for diabetic nephropathy in African Americans.pdf\",\n", + " \"2011 - Annotating individual human genomes.pdf\",\n", + " \"2012 - Systems Biology Approaches to Nutrition.pdf\"\n", + " ],\n", + " \"extraction_id\": [\n", + " \"80d78615-8424-5478-a01b-73e220bc0345\",\n", + " \"80d78615-8424-5478-a01b-73e220bc0345\",\n", + " \"80d78615-8424-5478-a01b-73e220bc0345\",\n", + " \"eb3de845-98db-505c-bb7f-c0f3259875fc\",\n", + " \"acf69ed8-c7b0-5d9f-8005-de020c9cf699\",\n", + " \"5f24a851-1de6-5b6e-8230-2da08806b01a\",\n", + " \"eb3de845-98db-505c-bb7f-c0f3259875fc\",\n", + " \"01ec7832-8a80-5f5e-aa26-3648f572c4a1\",\n", + " \"c17c74fa-12a3-5072-bb48-c179055db14d\",\n", + " \"eb3de845-98db-505c-bb7f-c0f3259875fc\"\n", + " ],\n", + " \"document_id\": [\n", + " \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n", + " \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n", + " \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n", + " \"6955478b-950d-5d29-b24c-3a5ca656f3ae\",\n", + " \"0ea34c04-5d09-5a32-89a7-c3add179927a\",\n", + " \"0f07fa43-feb6-5656-b7e7-b8faa86f5623\",\n", + " \"6955478b-950d-5d29-b24c-3a5ca656f3ae\",\n", + " \"5798fb6b-b3e6-57c4-9823-5428853dbfa1\",\n", + " \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n", + " \"6955478b-950d-5d29-b24c-3a5ca656f3ae\"\n", + " ],\n", + " \"id\": [\n", + " \"7656b48b-d191-516e-9753-d34efedd4812\",\n", + " \"d1af5c82-d226-5980-b5d9-90d7558d1880\",\n", + " \"bb2a67ec-135b-5d55-b33d-74b1dc085685\",\n", + " \"11fc663d2-2833-51e7-ae6a-55b007a6e27c\",\n", + " 
\"bb55a705-7399-550e-8285-07c33654b909\",\n", + " \"9bff43c0-fd12-572e-9996-24957edd17d2\",\n", + " \"2df84ccc-0d32-582e-bda6-9cd46bee5378\",\n", + " \"ff30f187-d5c3-5d01-8026-0588a77e9f44\",\n", + " \"57dc1ee5-4252-52c3-92cb-e2ac36cdc4d6\",\n", + " \"ff801099-e737-57b1-91af-a4cea20adb87\"\n", + " ],\n", + " \"contexts\": [\n", + " \"gene interaction and high predictive value, PLoS One 3 (5) (2008) e2031,doi:10.1371/journal.pone.0002031 . [107] M. van Hoek, A. Dehghan, J.C. Witteman, C.M. van Duijn, A.G. Uitterlinden, B.A. Oostra, A. Hofman, E.J. Sijbrands, A.C. Janssens, Predicting type 2 diabetes based on polymorphisms from genome-wide association studies: a population-based study, Diabetes 57 (11) (Nov 2008) 3122 3128. [108] Q. Lu, Y. Song, X. Wang, S. Won, Y. Cui, R.C. Elston, The effect of multiple genetic\",\n", + " \"variants in predicting the risk of type 2 diabetes, BMC Proc 3 (Suppl 7) (Dec 15 2009) S49. [109] K. Miyake, W. Yang, K. Hara, K. Yasuda, Y. Horikawa, H. Osawa, H. Furuta, et al., Construction of a prediction model for type 2 diabetes mellitus in the Japanese population based on 11 genes with strong evidence of the association, J. Hum. Genet. 54 (4) (Apr 2009) 236 241 [Epub 2009 Feb 27]. [110] P.J. Talmud, A.D. Hingorani, J.A. Cooper, M.G. Marmot, E.J. Brunner, M. Kumari, M.\",\n", + " \"type 2 diabetes risk, Diabetes 57 (11) (Nov 2008) 3129 3135. [103] Q. Lu, R.C. Elston, Using the optimal receiver operating characteristic curve to design a predictive genetic test, exempli ed with type 2 diabetes, Am. J. Hum. Genet. 82 (3) (Mar 2008) 641 651. [104] V. Lyssenko, A. Jonsson, P. Almgren, N. Pulizzi, B. Isomaa, T. Tuomi, G. Berglund, D. Altshuler, P. Nilsson, L. Groop, Clinical risk factors, DNA variants, and the development of type 2 diabetes, N. Engl. J. Med. 359 (21) (Nov 20 2008)\",\n", + " \"insulin resistance, hypertension, and dyslipidemia (Obesity Education Initiative Expert Panel, 1998 ). 
Insulin resist-ance increases with age, and the incidence of diabetes rises sharply in the elderly (American Diabetes Association, 2010a ). In a few patients, genetic mutations appear to be associ- ated with T2D (Roche et al. , 2005 ; American Diabetes Association, 2010a ). For example, recent work using the DPP data has led to the identi cation of 27 single nucle-\",\n", + " \"19. Permutt MA, Wasson J, Cox N: Genetic epidemiology of diabe- tes. J Clin Invest 2005, 115:1431-1439. 20. Barroso I: Genetics of Type 2 diabetes. Diabet Med 2005, 22:517-535. 21. Parikh H, Groop L: Candidate genes for type 2 diabetes. Rev Endocr Metab Disord 2004, 5:151-176. 22. Lohmueller KE, Pearce CL, Pike M, Lander ES, Hirschhorn JN: Meta- analysis of genetic association studies supports a contribu- tion of common variants to su sceptibility to common dis- ease. Nat Genet 2003, 33:177-182.\",\n", + " \"insulin-dependent diabetes and schizophrenia, twin studies have demon-strated the existence of a significant genetic component (Kyvik et al., 1995;Plomin et al., 1994). Genetic factors also influence cardiovascular diseaseswhich occur in early or midlife, while for cardiovascular diseases occur-ring late in life there is little evidence of a genetic effect (Marenberg et al.,1994). Dementia has a very strong genetic component, not only withregard to early-onset monogenic types but also to late-onset\",\n", + " \"Three categories of increased risk of developing diabetes are currently recognized by the ADA: an FPG between 5.6 and 6.9 mmol/L (100 and 125 mg/dL), de ned as having impaired fasting glucose (IFG); a 2 - h OGTT between 7.8 and 11 mmol/L (140 and 199 mg/dL), de ned as having impaired glucose tolerance (IGT); an A1C between 5.7 and 6.4% with values between 6.0 and 6.4 considered very high risk (American Diabetes Association, 2010a ). 
It is estimated that approximately one - fourth of indi-\",\n", + " \"20 90 D20S451 0.006 10.7 5.4 (34) 8.42 5.4 (61) 0.30 (long duration) Interaction with age at diagnosis of diabetes 19 1 D1S1665 0.004 37.4 8.1 (66) 41.2 8.3 (81) 0.23 (early onset) 2 159 D2S1399/D2S1353 0.023 40.8 8.2 (53) 38.8 8.5 (94) 0.16 (late onset) 3 135 D3S2460 0.036 37.7 8.6 (66) 41.0 8.0 (81) 0.16 (early onset) 4 146 D4S1625 0.005 37.9 7.4 (52) 40.4 8.9 (95) 0.23 (early onset) 65 5 D6S2427 0.024 38.0 7.8 (70) 40.9 8.8 (77) 0.18 (early onset)\",\n", + " \"prevention, even though there are great concerns about how such information will be perceived and utilized [68]. Recent studies in diabetes and cardiovascular disease suggest that the addition of previously associated genetic risk loci in clinical risk models of those diseases increases both discriminative and predictive accuracy, albeit only marginally. Typically, the strongest predictors of disease onset are known clinical risk factors such as body mass index, age, or\",\n", + " \"and sex, weight for height greater than the 85th percentile, or weight greater than 120% of ideal for height) who also have any two of the following risk factors: family history of T2D ( rst - or second - degree relative), maternal history of diabetes or GDM during child s gestation, member of high - risk race/ethnicity group (Native American, African -(40 in) in men is considered an indicator of increased\"\n", + " ],\n", + " \"answer\": \"The factors that help predict early onset of diabetes include: 1. **Genetic Factors**: Specific genetic mutations and polymorphisms have been associated with an increased risk of developing type 2 diabetes (T2D) [1], [2], [4], [5], [6], [8]. 2. **Clinical Risk Factors**: Known clinical risk factors such as body mass index (BMI), age, and sex are strong predictors of disease onset [9]. 3. **Family History**: A family history of T2D, particularly in first- or second-degree relatives, is a significant risk factor [10]. 4. 
**Maternal History**: A maternal history of diabetes or gestational diabetes mellitus (GDM) during the childs gestation increases the risk [10]. 5. **Ethnicity**: Being a member of a high-risk race/ethnicity group, such as Native American or African American, is associated with a higher risk [10]. 6. **Impaired Glucose Levels**: Impaired fasting glucose (IFG), impaired glucose tolerance (IGT), and elevated A1C levels are indicators of increased risk [7]. 7. **Insulin Resistance and Related Conditions**: Conditions such as insulin resistance, hypertension, and dyslipidemia are linked to a higher risk of developing diabetes [4]. These factors collectively contribute to the prediction of early onset diabetes.\",\n", + " \"question\": \"List any factors that help predict early onset of diabetes.\"\n", + "}\n" + ] + } + ], + "source": [ + "print(json.dumps(ragas_output, indent=2))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gnqa/paper2_eval/src/parse_r2r_result.py b/gnqa/paper2_eval/src/parse_r2r_result.py new file mode 100644 index 0000000..b30f2e7 --- /dev/null +++ b/gnqa/paper2_eval/src/parse_r2r_result.py @@ -0,0 +1,44 @@ +import json +import sys + +read_file = '/data/code/gn-ai/gnqa/paper2_eval/data/rag_out_1.json' + +def iterate_json(obj, thedict): + if isinstance(obj, dict): + for key, val in obj.items(): + if (key == "text"): + thedict["contexts"].append(val.replace("\n", " ").strip()) + print("Key -> {0}\tValue -> {1}".format(key,val)) + elif (key == "metadata"): + thedict["answer"] = val#.replace("\n", " ").strip() + print("Key -> {0}\tValue -> {1}".format(key,val)) + 
elif (key == "id"): + print("Key -> {0}\tValue -> {1}".format(key,val)) + elif (key == "associatedQuery"): + thedict["question"] = val.replace("\n", " ").strip() + print("Key -> {0}\tValue -> {1}".format(key,val)) + elif (key == "title"): + print("Key -> {0}\tValue -> {1}".format(key,val)) + elif (key == "document_id"): + print("Key -> {0}\tValue -> {1}".format(key,val)) + else: + if (len(obj.items()) == 1 ): + print(key, " --> ", val) + iterate_json(val, thedict) + elif isinstance(obj, list): + for item in obj: + iterate_json(item, thedict) + +# this should be a json file with a list of input files and an output file +with open(read_file, "r") as r_file: + result_file = json.load(r_file) + +ragas_output = { + "contexts": [], + "titles": [], + "answer": "", + "question": ""} +vector_search_results = result_file["vector_search_results"] +iterate_json(vector_search_results, ragas_output) + +print(json.dumps(ragas_output, indent=2))
"""Send a fixed list of questions to a local R2R server and save the answers.

Repository path: gnqa/paper2_eval/src/run_questions.py

Usage:
  python run_questions.py <questions.json> <responses.json>

The first argument is required but its contents are not read; the questions
asked are the hard-coded ``questions`` list below.
"""
import json
import sys
import os

from r2r import R2R, Document, GenerationConfig, R2RClient

# Shown (and the script exits) when fewer than two arguments are supplied.
USAGE = 'Example use "python run_questions.py ../data/questions/human/de/aging.json ../data/responses/human/de/aging_resp.json"'

# Address of the R2R service the questions are sent to.
R2R_URL = "http://localhost:8000"

questions = [
  "List as many studies as you can that include rapamycin.",
  "Why is it so difficult to map gene loci that control aging in humans?"
]


def writeDatasetFile(responses, outp_file):
  """Append *responses*, serialised as indented JSON, to *outp_file*.

  Args:
    responses: Object to serialise with ``json.dumps(..., indent=2)``.
    outp_file: Path of the file to append to; it is created if missing.

  Raises:
    TypeError: If ``json.dumps`` cannot serialise *responses*.
    OSError: If the file cannot be opened or written.
  """
  print(outp_file)
  output = json.dumps(responses, indent=2)
  # The original code had two identical branches (file exists / does not
  # exist), both appending the same text; they are merged here.
  # NOTE(review): appending to a file that already holds a JSON document
  # yields concatenated documents, which is not valid JSON as a whole --
  # confirm whether overwrite ("w") was intended.
  with open(outp_file, "a", encoding="utf-8") as the_data:
    the_data.write(output)


def main(argv):
  """Validate arguments, query the R2R service, and write the responses.

  Args:
    argv: Full argument vector (``sys.argv`` layout): program name, the
      questions file path, then the output file path.
  """
  # Check the arguments before doing any network work, so a missing
  # argument fails immediately instead of after every question has run.
  try:
    _read_file = str(argv[1])  # accepted for CLI compatibility; not read yet
    out_file = str(argv[2])
  except IndexError:
    sys.exit(USAGE)

  client = R2RClient(R2R_URL)
  health_resp = client.health()
  print("The R2R client's health status is {0}".format(health_resp))

  # Responses are keyed by the 1-based position of the question, as a string.
  rag_response = {
    str(ndx): client.rag(question)
    for ndx, question in enumerate(questions, start=1)
  }

  writeDatasetFile(rag_response, out_file)


if __name__ == "__main__":
  main(sys.argv)
\ No newline at end of file |