From 50f0ed1d717d6877cb0562b1f2d54f0f242312d9 Mon Sep 17 00:00:00 2001 From: ShelbySolomonDarnell Date: Fri, 16 Aug 2024 17:26:14 +0300 Subject: added paper2_eval --- gnqa/paper2_eval/README.md | 6 ++ gnqa/paper2_eval/data/rag_out_1.json | 28 +++++ gnqa/paper2_eval/data/rag_out_2.json | 5 + gnqa/paper2_eval/src/parsejson.py | 63 ++++++++++++ gnqa/paper2_eval/src/retrieve_context.py | 171 +++++++++++++++++++++++++++++++ 5 files changed, 273 insertions(+) create mode 100644 gnqa/paper2_eval/README.md create mode 100644 gnqa/paper2_eval/data/rag_out_1.json create mode 100644 gnqa/paper2_eval/data/rag_out_2.json create mode 100644 gnqa/paper2_eval/src/parsejson.py create mode 100644 gnqa/paper2_eval/src/retrieve_context.py (limited to 'gnqa/paper2_eval') diff --git a/gnqa/paper2_eval/README.md b/gnqa/paper2_eval/README.md new file mode 100644 index 0000000..13cb113 --- /dev/null +++ b/gnqa/paper2_eval/README.md @@ -0,0 +1,6 @@ +# Paper 2 Evaluation + + +This directory contains the code created to evaluate questions submitted to GNQA. +Unlike the evaluation in paper 1, this work uses different LLMs and a different RAG engine. +RAGAS is still used to evaluate the queries. diff --git a/gnqa/paper2_eval/data/rag_out_1.json b/gnqa/paper2_eval/data/rag_out_1.json new file mode 100644 index 0000000..48ca603 --- /dev/null +++ b/gnqa/paper2_eval/data/rag_out_1.json @@ -0,0 +1,28 @@ +{'vector_search_results': + [ + {'id': '7656b48b-d191-516e-9753-d34efedd4812', + 'score': 1.0, + 'metadata': {'text': 'gene interaction and high predictive value, PLoS One 3 (5) (2008) e2031,doi:10.1371/journal.pone.0002031 .\n[107] M. van Hoek, A. Dehghan, J.C. Witteman, C.M. van Duijn, A.G. Uitterlinden, B.A.\nOostra, A. Hofman, E.J. Sijbrands, A.C. Janssens, Predicting type 2 diabetes based\non polymorphisms from genome-wide association studies: a population-based\nstudy, Diabetes 57 (11) (Nov 2008) 3122 3128.\n[108] Q. Lu, Y. Song, X. Wang, S. Won, Y. Cui, R.C. 
Elston, The effect of multiple genetic', + 'title': '2011 - Annotating individual human genomes.pdf', + 'version': 'v0', + 'chunk_order': 160, + 'document_id': 'f7b5d738-3f0b-5074-9c21-f6b443b4e07f', + 'extraction_id': '80d78615-8424-5478-a01b-73e220bc0345', + 'associatedQuery': 'List any factors that help predict early onset of diabetes.'} + }, + {'id': 'd1af5c82-d226-5980-b5d9-90d7558d1880', + 'score': 1.0, + 'metadata': + { + 'text': 'variants in predicting the risk of type 2 diabetes, BMC Proc 3 (Suppl 7) (Dec 15\n2009) S49.\n[109] K. Miyake, W. Yang, K. Hara, K. Yasuda, Y. Horikawa, H. Osawa, H. Furuta, et al.,\nConstruction of a prediction model for type 2 diabetes mellitus in the Japanese\npopulation based on 11 genes with strong evidence of the association, J. Hum.\nGenet. 54 (4) (Apr 2009) 236 241 [Epub 2009 Feb 27].\n[110] P.J. Talmud, A.D. Hingorani, J.A. Cooper, M.G. Marmot, E.J. Brunner, M. Kumari, M.', 'title': '2011 - Annotating individual human genomes.pdf', + 'version': 'v0', + 'chunk_order': 161, + 'document_id': 'f7b5d738-3f0b-5074-9c21-f6b443b4e07f', + 'extraction_id': '80d78615-8424-5478-a01b-73e220bc0345', + 'associatedQuery': 'List any factors that help predict early onset of diabetes.' + } + }, + {'id': 'bb2a67ec-135b-5d55-b33d-74b1dc085685', 'score': 1.0, 'metadata': {'text': 'type 2 diabetes risk, Diabetes 57 (11) (Nov 2008) 3129 3135.\n[103] Q. Lu, R.C. Elston, Using the optimal receiver operating characteristic curve to\ndesign a predictive genetic test, exempli ed with type 2 diabetes, Am. J. Hum.\nGenet. 82 (3) (Mar 2008) 641 651.\n[104] V. Lyssenko, A. Jonsson, P. Almgren, N. Pulizzi, B. Isomaa, T. Tuomi, G. Berglund, D.\nAltshuler, P. Nilsson, L. Groop, Clinical risk factors, DNA variants, and the\ndevelopment of type 2 diabetes, N. Engl. J. Med. 
359 (21) (Nov 20 2008)', 'title': '2011 - Annotating individual human genomes.pdf', 'version': 'v0', 'chunk_order': 158, 'document_id': 'f7b5d738-3f0b-5074-9c21-f6b443b4e07f', 'extraction_id': '80d78615-8424-5478-a01b-73e220bc0345', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': '11fc663d2-2833-51e7-ae6a-55b007a6e27c', 'score': 1.0, 'metadata': {'text': 'insulin resistance, hypertension, and dyslipidemia (Obesity Education Initiative Expert Panel, 1998 ). Insulin resist-ance increases with age, and the incidence of diabetes rises sharply in the elderly (American Diabetes Association, 2010a ). \n In a few patients, genetic mutations appear to be associ-\nated with T2D (Roche et al. , 2005 ; American Diabetes \nAssociation, 2010a ). For example, recent work using the DPP data has led to the identi cation of 27 single nucle-', 'title': '2012 - Systems Biology Approaches to Nutrition.pdf', 'version': 'v0', 'chunk_order': 9596, 'document_id': '6955478b-950d-5d29-b24c-3a5ca656f3ae', 'extraction_id': 'eb3de845-98db-505c-bb7f-c0f3259875fc', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': 'bb55a705-7399-550e-8285-07c33654b909', 'score': 1.0, 'metadata': {'text': '19. Permutt MA, Wasson J, Cox N: Genetic epidemiology of diabe-\ntes. J Clin Invest 2005, 115:1431-1439.\n20. Barroso I: Genetics of Type 2 diabetes. Diabet Med 2005,\n22:517-535.\n21. Parikh H, Groop L: Candidate genes for type 2 diabetes. Rev\nEndocr Metab Disord 2004, 5:151-176.\n22. Lohmueller KE, Pearce CL, Pike M, Lander ES, Hirschhorn JN: Meta-\nanalysis of genetic association studies supports a contribu-\ntion of common variants to su sceptibility to common dis-\nease. 
Nat Genet 2003, 33:177-182.', 'title': '2006 - β2-adrenergic receptor and UCP3 variants modulate the relationship between age and type 2 diabetes mellitus.pdf', 'version': 'v0', 'chunk_order': 86, 'document_id': '0ea34c04-5d09-5a32-89a7-c3add179927a', 'extraction_id': 'acf69ed8-c7b0-5d9f-8005-de020c9cf699', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': '9bff43c0-fd12-572e-9996-24957edd17d2', 'score': 1.0, 'metadata': {'text': 'insulin-dependent diabetes and schizophrenia, twin studies have demon-strated the existence of a significant genetic component (Kyvik et al., 1995;Plomin et al., 1994). Genetic factors also influence cardiovascular diseaseswhich occur in early or midlife, while for cardiovascular diseases occur-ring late in life there is little evidence of a genetic effect (Marenberg et al.,1994). Dementia has a very strong genetic component, not only withregard to early-onset monogenic types but also to late-onset', 'title': '2001 - Demography in the age of genomics.pdf', 'version': 'v0', 'chunk_order': 452, 'document_id': '0f07fa43-feb6-5656-b7e7-b8faa86f5623', 'extraction_id': '5f24a851-1de6-5b6e-8230-2da08806b01a', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': '2df84ccc-0d32-582e-bda6-9cd46bee5378', 'score': 1.0, 'metadata': {'text': 'Three categories of increased risk of developing diabetes \nare currently recognized by the ADA: an FPG between 5.6 and 6.9 mmol/L (100 and 125 mg/dL), de ned as having \nimpaired fasting glucose (IFG); a 2 - h OGTT between 7.8 and 11 mmol/L (140 and 199 mg/dL), de ned as having \nimpaired glucose tolerance (IGT); an A1C between 5.7 and 6.4% with values between 6.0 and 6.4 considered very high risk (American Diabetes Association, 2010a ). 
\n It is estimated that approximately one - fourth of indi-', 'title': '2012 - Systems Biology Approaches to Nutrition.pdf', 'version': 'v0', 'chunk_order': 9590, 'document_id': '6955478b-950d-5d29-b24c-3a5ca656f3ae', 'extraction_id': 'eb3de845-98db-505c-bb7f-c0f3259875fc', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': 'ff30f187-d5c3-5d01-8026-0588a77e9f44', 'score': 1.0, 'metadata': {'text': '20 90 D20S451 0.006 10.7 5.4 (34) 8.42 5.4 (61) 0.30 (long duration)\nInteraction with age at diagnosis of diabetes\n19 1 D1S1665 0.004 37.4 8.1 (66) 41.2 8.3 (81) 0.23 (early onset)\n2 159 D2S1399/D2S1353 0.023 40.8 8.2 (53) 38.8 8.5 (94) 0.16 (late onset)\n3 135 D3S2460 0.036 37.7 8.6 (66) 41.0 8.0 (81) 0.16 (early onset)\n4 146 D4S1625 0.005 37.9 7.4 (52) 40.4 8.9 (95) 0.23 (early onset)\n65 5 D6S2427 0.024 38.0 7.8 (70) 40.9 8.8 (77) 0.18 (early onset)', 'title': '2004 - A genome scan for diabetic nephropathy in African Americans.pdf', 'version': 'v0', 'chunk_order': 64, 'document_id': '5798fb6b-b3e6-57c4-9823-5428853dbfa1', 'extraction_id': '01ec7832-8a80-5f5e-aa26-3648f572c4a1', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': '57dc1ee5-4252-52c3-92cb-e2ac36cdc4d6', 'score': 1.0, 'metadata': {'text': 'prevention, even though there are great concerns about how such\ninformation will be perceived and utilized [68]. Recent studies in\ndiabetes and cardiovascular disease suggest that the addition of\npreviously associated genetic risk loci in clinical risk models of those\ndiseases increases both discriminative and predictive accuracy, albeit\nonly marginally. 
Typically, the strongest predictors of disease onset\nare known clinical risk factors such as body mass index, age, or', 'title': '2011 - Annotating individual human genomes.pdf', 'version': 'v0', 'chunk_order': 66, 'document_id': 'f7b5d738-3f0b-5074-9c21-f6b443b4e07f', 'extraction_id': 'c17c74fa-12a3-5072-bb48-c179055db14d', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': 'ff801099-e737-57b1-91af-a4cea20adb87', 'score': 1.0, 'metadata': {'text': 'and sex, weight for height greater than the 85th percentile, or weight greater than 120% of ideal for height) who also have any two of the following risk factors: family history of T2D ( rst - or second - degree relative), maternal history \nof diabetes or GDM during child s gestation, member of high - risk race/ethnicity group (Native American, African -(40 in) in men is considered an indicator of increased', 'title': '2012 - Systems Biology Approaches to Nutrition.pdf', 'version': 'v0', 'chunk_order': 9595, 'document_id': '6955478b-950d-5d29-b24c-3a5ca656f3ae', 'extraction_id': 'eb3de845-98db-505c-bb7f-c0f3259875fc', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}], 'kg_search_results': None}, + +{'id': 'chatcmpl-9wP7fPKqhM05QleyuBp1k6DB74T0x', 'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': "The factors that help predict early onset of diabetes include:\n\n1. **Genetic Factors**: Specific genetic mutations and polymorphisms have been associated with an increased risk of developing type 2 diabetes (T2D) [1], [2], [4], [5], [6], [8].\n2. **Clinical Risk Factors**: Known clinical risk factors such as body mass index (BMI), age, and sex are strong predictors of disease onset [9].\n3. **Family History**: A family history of T2D, particularly in first- or second-degree relatives, is a significant risk factor [10].\n4. 
**Maternal History**: A maternal history of diabetes or gestational diabetes mellitus (GDM) during the child's gestation increases the risk [10].\n5. **Ethnicity**: Being a member of a high-risk race/ethnicity group, such as Native American or African American, is associated with a higher risk [10].\n6. **Impaired Glucose Levels**: Impaired fasting glucose (IFG), impaired glucose tolerance (IGT), and elevated A1C levels are indicators of increased risk [7].\n7. **Insulin Resistance and Related Conditions**: Conditions such as insulin resistance, hypertension, and dyslipidemia are linked to a higher risk of developing diabetes [4].\n\nThese factors collectively contribute to the prediction of early onset diabetes.", 'role': 'assistant'}}], 'created': 1723706623, 'model': 'gpt-4o-2024-05-13', 'object': 'chat.completion', 'system_fingerprint': 'fp_3aa7262c27', 'usage': {'completion_tokens': 284, 'prompt_tokens': 1723, 'total_tokens': 2007}} +Time taken for RAG: 14.26 seconds diff --git a/gnqa/paper2_eval/data/rag_out_2.json b/gnqa/paper2_eval/data/rag_out_2.json new file mode 100644 index 0000000..b14bfbf --- /dev/null +++ b/gnqa/paper2_eval/data/rag_out_2.json @@ -0,0 +1,5 @@ +Search Results: +{'vector_search_results': [{'id': '7656b48b-d191-516e-9753-d34efedd4812', 'score': 0.6306634325184495, 'metadata': {'text': 'gene interaction and high predictive value, PLoS One 3 (5) (2008) e2031,doi:10.1371/journal.pone.0002031 .\n[107] M. van Hoek, A. Dehghan, J.C. Witteman, C.M. van Duijn, A.G. Uitterlinden, B.A.\nOostra, A. Hofman, E.J. Sijbrands, A.C. Janssens, Predicting type 2 diabetes based\non polymorphisms from genome-wide association studies: a population-based\nstudy, Diabetes 57 (11) (Nov 2008) 3122 3128.\n[108] Q. Lu, Y. Song, X. Wang, S. Won, Y. Cui, R.C. 
Elston, The effect of multiple genetic', 'title': '2011 - Annotating individual human genomes.pdf', 'version': 'v0', 'chunk_order': 160, 'document_id': 'f7b5d738-3f0b-5074-9c21-f6b443b4e07f', 'extraction_id': '80d78615-8424-5478-a01b-73e220bc0345', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': 'd1af5c82-d226-5980-b5d9-90d7558d1880', 'score': 0.6210695956862499, 'metadata': {'text': 'variants in predicting the risk of type 2 diabetes, BMC Proc 3 (Suppl 7) (Dec 15\n2009) S49.\n[109] K. Miyake, W. Yang, K. Hara, K. Yasuda, Y. Horikawa, H. Osawa, H. Furuta, et al.,\nConstruction of a prediction model for type 2 diabetes mellitus in the Japanese\npopulation based on 11 genes with strong evidence of the association, J. Hum.\nGenet. 54 (4) (Apr 2009) 236 241 [Epub 2009 Feb 27].\n[110] P.J. Talmud, A.D. Hingorani, J.A. Cooper, M.G. Marmot, E.J. Brunner, M. Kumari, M.', 'title': '2011 - Annotating individual human genomes.pdf', 'version': 'v0', 'chunk_order': 161, 'document_id': 'f7b5d738-3f0b-5074-9c21-f6b443b4e07f', 'extraction_id': '80d78615-8424-5478-a01b-73e220bc0345', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': 'bb2a67ec-135b-5d55-b33d-74b1dc085685', 'score': 0.6134476661682129, 'metadata': {'text': 'type 2 diabetes risk, Diabetes 57 (11) (Nov 2008) 3129 3135.\n[103] Q. Lu, R.C. Elston, Using the optimal receiver operating characteristic curve to\ndesign a predictive genetic test, exempli ed with type 2 diabetes, Am. J. Hum.\nGenet. 82 (3) (Mar 2008) 641 651.\n[104] V. Lyssenko, A. Jonsson, P. Almgren, N. Pulizzi, B. Isomaa, T. Tuomi, G. Berglund, D.\nAltshuler, P. Nilsson, L. Groop, Clinical risk factors, DNA variants, and the\ndevelopment of type 2 diabetes, N. Engl. J. Med. 
359 (21) (Nov 20 2008)', 'title': '2011 - Annotating individual human genomes.pdf', 'version': 'v0', 'chunk_order': 158, 'document_id': 'f7b5d738-3f0b-5074-9c21-f6b443b4e07f', 'extraction_id': '80d78615-8424-5478-a01b-73e220bc0345', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': '9fc663d2-2833-51e7-ae6a-55b007a6e27c', 'score': 0.5998189449310303, 'metadata': {'text': 'insulin resistance, hypertension, and dyslipidemia (Obesity Education Initiative Expert Panel, 1998 ). Insulin resist-ance increases with age, and the incidence of diabetes rises sharply in the elderly (American Diabetes Association, 2010a ). \n In a few patients, genetic mutations appear to be associ-\nated with T2D (Roche et al. , 2005 ; American Diabetes \nAssociation, 2010a ). For example, recent work using the DPP data has led to the identi cation of 27 single nucle-', 'title': '2012 - Systems Biology Approaches to Nutrition.pdf', 'version': 'v0', 'chunk_order': 9596, 'document_id': '6955478b-950d-5d29-b24c-3a5ca656f3ae', 'extraction_id': 'eb3de845-98db-505c-bb7f-c0f3259875fc', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': 'bb55a705-7399-550e-8285-07c33654b909', 'score': 0.5985058546066284, 'metadata': {'text': '19. Permutt MA, Wasson J, Cox N: Genetic epidemiology of diabe-\ntes. J Clin Invest 2005, 115:1431-1439.\n20. Barroso I: Genetics of Type 2 diabetes. Diabet Med 2005,\n22:517-535.\n21. Parikh H, Groop L: Candidate genes for type 2 diabetes. Rev\nEndocr Metab Disord 2004, 5:151-176.\n22. Lohmueller KE, Pearce CL, Pike M, Lander ES, Hirschhorn JN: Meta-\nanalysis of genetic association studies supports a contribu-\ntion of common variants to su sceptibility to common dis-\nease. 
Nat Genet 2003, 33:177-182.', 'title': '2006 - β2-adrenergic receptor and UCP3 variants modulate the relationship between age and type 2 diabetes mellitus.pdf', 'version': 'v0', 'chunk_order': 86, 'document_id': '0ea34c04-5d09-5a32-89a7-c3add179927a', 'extraction_id': 'acf69ed8-c7b0-5d9f-8005-de020c9cf699', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': '9bff43c0-fd12-572e-9996-24957edd17d2', 'score': 0.5946860555001475, 'metadata': {'text': 'insulin-dependent diabetes and schizophrenia, twin studies have demon-strated the existence of a significant genetic component (Kyvik et al., 1995;Plomin et al., 1994). Genetic factors also influence cardiovascular diseaseswhich occur in early or midlife, while for cardiovascular diseases occur-ring late in life there is little evidence of a genetic effect (Marenberg et al.,1994). Dementia has a very strong genetic component, not only withregard to early-onset monogenic types but also to late-onset', 'title': '2001 - Demography in the age of genomics.pdf', 'version': 'v0', 'chunk_order': 452, 'document_id': '0f07fa43-feb6-5656-b7e7-b8faa86f5623', 'extraction_id': '5f24a851-1de6-5b6e-8230-2da08806b01a', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': '2df84ccc-0d32-582e-bda6-9cd46bee5378', 'score': 0.5944506525993347, 'metadata': {'text': 'Three categories of increased risk of developing diabetes \nare currently recognized by the ADA: an FPG between 5.6 and 6.9 mmol/L (100 and 125 mg/dL), de ned as having \nimpaired fasting glucose (IFG); a 2 - h OGTT between 7.8 and 11 mmol/L (140 and 199 mg/dL), de ned as having \nimpaired glucose tolerance (IGT); an A1C between 5.7 and 6.4% with values between 6.0 and 6.4 considered very high risk (American Diabetes Association, 2010a ). 
\n It is estimated that approximately one - fourth of indi-', 'title': '2012 - Systems Biology Approaches to Nutrition.pdf', 'version': 'v0', 'chunk_order': 9590, 'document_id': '6955478b-950d-5d29-b24c-3a5ca656f3ae', 'extraction_id': 'eb3de845-98db-505c-bb7f-c0f3259875fc', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': 'ff30f187-d5c3-5d01-8026-0588a77e9f44', 'score': 0.5909770727157593, 'metadata': {'text': '20 90 D20S451 0.006 10.7 5.4 (34) 8.42 5.4 (61) 0.30 (long duration)\nInteraction with age at diagnosis of diabetes\n19 1 D1S1665 0.004 37.4 8.1 (66) 41.2 8.3 (81) 0.23 (early onset)\n2 159 D2S1399/D2S1353 0.023 40.8 8.2 (53) 38.8 8.5 (94) 0.16 (late onset)\n3 135 D3S2460 0.036 37.7 8.6 (66) 41.0 8.0 (81) 0.16 (early onset)\n4 146 D4S1625 0.005 37.9 7.4 (52) 40.4 8.9 (95) 0.23 (early onset)\n65 5 D6S2427 0.024 38.0 7.8 (70) 40.9 8.8 (77) 0.18 (early onset)', 'title': '2004 - A genome scan for diabetic nephropathy in African Americans.pdf', 'version': 'v0', 'chunk_order': 64, 'document_id': '5798fb6b-b3e6-57c4-9823-5428853dbfa1', 'extraction_id': '01ec7832-8a80-5f5e-aa26-3648f572c4a1', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': '57dc1ee5-4252-52c3-92cb-e2ac36cdc4d6', 'score': 0.5907666927119091, 'metadata': {'text': 'prevention, even though there are great concerns about how such\ninformation will be perceived and utilized [68]. Recent studies in\ndiabetes and cardiovascular disease suggest that the addition of\npreviously associated genetic risk loci in clinical risk models of those\ndiseases increases both discriminative and predictive accuracy, albeit\nonly marginally. 
Typically, the strongest predictors of disease onset\nare known clinical risk factors such as body mass index, age, or', 'title': '2011 - Annotating individual human genomes.pdf', 'version': 'v0', 'chunk_order': 66, 'document_id': 'f7b5d738-3f0b-5074-9c21-f6b443b4e07f', 'extraction_id': 'c17c74fa-12a3-5072-bb48-c179055db14d', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}, {'id': 'ff801099-e737-57b1-91af-a4cea20adb87', 'score': 0.58806312084198, 'metadata': {'text': 'and sex, weight for height greater than the 85th percentile, or weight greater than 120% of ideal for height) who also have any two of the following risk factors: family history of T2D ( rst - or second - degree relative), maternal history \nof diabetes or GDM during child s gestation, member of high - risk race/ethnicity group (Native American, African -(40 in) in men is considered an indicator of increased', 'title': '2012 - Systems Biology Approaches to Nutrition.pdf', 'version': 'v0', 'chunk_order': 9595, 'document_id': '6955478b-950d-5d29-b24c-3a5ca656f3ae', 'extraction_id': 'eb3de845-98db-505c-bb7f-c0f3259875fc', 'associatedQuery': 'List any factors that help predict early onset of diabetes.'}}], 'kg_search_results': None} +Completion: +{'id': 'chatcmpl-9wRnHpkyuzTtAXaZGr0qT373xjaOO', 'choices': [{'finish_reason': 'stop', 'index': 0, 'message': {'content': "The factors that help predict the early onset of diabetes include:\n\n1. **Genetic Factors**: Specific genetic mutations and polymorphisms have been associated with an increased risk of developing diabetes. For example, studies have identified multiple genetic variants that can predict the risk of type 2 diabetes [1], [2], [4], [5], [6].\n\n2. **Clinical Risk Factors**: Known clinical risk factors such as body mass index (BMI), age, and sex are strong predictors of disease onset [9].\n\n3. 
**Family History**: A family history of type 2 diabetes (T2D), particularly in first- or second-degree relatives, is a significant risk factor [10].\n\n4. **Maternal History of Diabetes**: A maternal history of diabetes or gestational diabetes mellitus (GDM) during the child's gestation increases the risk [10].\n\n5. **Race/Ethnicity**: Being a member of a high-risk race/ethnicity group, such as Native American or African American, is associated with a higher risk of developing diabetes [10].\n\n6. **Impaired Glucose Tolerance and Impaired Fasting Glucose**: Individuals with impaired fasting glucose (IFG) or impaired glucose tolerance (IGT) are at increased risk of developing diabetes [7].\n\n7. **Insulin Resistance, Hypertension, and Dyslipidemia**: These conditions are associated with an increased risk of diabetes, particularly as they often occur together in metabolic syndrome [4].\n\n8. **Age at Diagnosis**: There is an interaction between genetic factors and the age at diagnosis, with certain genetic markers being more strongly associated with early-onset diabetes [8].\n\nThese factors collectively help in predicting the early onset of diabetes.", 'role': 'assistant'}}], 'created': 1723716891, 'model': 'gpt-4o-2024-05-13', 'object': 'chat.completion', 'system_fingerprint': 'fp_3aa7262c27', 'usage': {'completion_tokens': 353, 'prompt_tokens': 1723, 'total_tokens': 2076}}
+Time taken for RAG: 9.47 seconds
diff --git a/gnqa/paper2_eval/src/parsejson.py b/gnqa/paper2_eval/src/parsejson.py
new file mode 100644
index 0000000..b49a898
--- /dev/null
+++ b/gnqa/paper2_eval/src/parsejson.py
@@ -0,0 +1,88 @@
+"""Flatten saved RAG responses into question/answer/contexts lists for RAGAS.
+
+Usage: python3 parsejson.py <file_list_tag> <file_list.json> <output_file>
+"""
+import json
+import sys
+
+
+def iterate_json(obj, thedict):
+    """Recursively collect "text", "answer" and "question" strings from obj.
+
+    Each "text" string is appended to thedict["contexts"]; "answer" and
+    "question" strings replace the matching entry of thedict. Newlines become
+    spaces and surrounding whitespace is stripped.
+    """
+    if isinstance(obj, dict):
+        for key, val in obj.items():
+            # Only strings can be cleaned; any other value (None, a nested
+            # object) is descended into rather than crashing on .replace().
+            if isinstance(val, str) and key in ("text", "answer", "question"):
+                cleaned = val.replace("\n", " ").strip()
+                if key == "text":
+                    thedict["contexts"].append(cleaned)
+                else:
+                    thedict[key] = cleaned
+            else:
+                if len(obj) == 1:
+                    print(key, " --> ", val)
+                iterate_json(val, thedict)
+    elif isinstance(obj, list):
+        for item in obj:
+            iterate_json(item, thedict)
+
+
+def create_dataset_from_files(tag, file_name, rag_out):
+    """Append one question/answer/contexts row to rag_out per file.
+
+    file_name[tag] is the list of file names to read from ./data/.
+    """
+    for the_file in file_name[tag]:
+        ragas_output = create_resultset_from_file(the_file)
+        rag_out["answer"].append(ragas_output["answer"])
+        rag_out["question"].append(ragas_output["question"])
+        rag_out["contexts"].append(ragas_output["contexts"])
+
+
+def create_resultset_from_file(file_name):
+    """Parse ./data/<file_name> as JSON and return what iterate_json finds."""
+    ragas_output = {
+        "contexts": [],
+        "answer": "",
+        "question": ""}
+    with open("./data/" + file_name, "r") as r_file:
+        data_file = json.load(r_file)
+    iterate_json(data_file, ragas_output)
+    return ragas_output
+
+
+def main(argv):
+    """Build the dataset for tag argv[1] from list argv[2]; append to argv[3]."""
+    if len(argv) < 4:
+        sys.exit("Usage: python3 parsejson.py "
+                 "<file_list_tag> <file_list.json> <output_file>")
+    file_list_tag = str(argv[1])
+    read_file = str(argv[2])  # e.g. doc_list.json
+    outp_file = str(argv[3])
+
+    rag_out = {
+        "question": [],
+        "answer": [],
+        "contexts": []
+    }
+
+    # this should be a json file mapping each tag to a list of input files
+    with open(read_file, "r") as r_file:
+        file_lst = json.load(r_file)
+
+    create_dataset_from_files(file_list_tag, file_lst, rag_out)
+
+    # Output format is unchanged: every run appends ",\n" and then the JSON,
+    # so the output file on its own is not a valid JSON document.
+    with open(outp_file, "a") as the_data:
+        the_data.write(",\n")
+        the_data.write(json.dumps(rag_out, indent=2))
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/gnqa/paper2_eval/src/retrieve_context.py b/gnqa/paper2_eval/src/retrieve_context.py
new file mode 100644
index 0000000..58b9d47
--- /dev/null
+++ b/gnqa/paper2_eval/src/retrieve_context.py
@@ -0,0 +1,178 @@
+import os
+import sys
+import json
+import time
+import configparser
+import apis.process as gnqa
+from apis.process import get_gnqa, get_response_from_taskid
+
+
+config = configparser.ConfigParser()
+config.read('_config.cfg')
+
+# Responses are flushed to a new dataset file every BATCH_SIZE queries.
+BATCH_SIZE = 5
+# Seconds to sleep after each request, to not overtask the api.
+API_PAUSE_SECONDS = 10
+
+
+def simplifyContext(refs):
+    """Return the comboTxt of each ref with newlines and tabs removed.
+
+    The refs object is a list of items containing doc_id, bibInfo, and
+    comboTxt. We only need comboTxt.
+    """
+    return [
+        item['comboTxt'].replace('\n', '').replace('\t', '')
+        for item in refs
+    ]
+
+
+def writeDatasetFile(responses, outp_file):
+    """Append responses to outp_file as indented JSON."""
+    print(outp_file)
+    # Mode "a" creates the file when it is missing, so no os.path.exists
+    # check is needed; an existing file is appended to, never overwritten.
+    with open(outp_file, "a") as the_data:
+        the_data.write(json.dumps(responses, indent=2))
+
+
+def reset_responses():
+    """Return an empty accumulator for one batch of responses."""
+    return {
+        'question': [],
+        'answer': [],
+        'contexts': [],
+        'task_id': []
+    }
+
+
+def parse_document(jsonfile):
+    """Create datasets for every level/domain/query entry of jsonfile."""
+    print('Parse document')
+    for item in jsonfile:
+        level = item['level']
+        domain = item['domain']
+        query_lst = item['query']
+        create_datasets(query_lst, domain, level)
+
+
+def create_datasets(query_list, domain, level):
+    """Ask GNQA every query and write the responses in batches of BATCH_SIZE."""
+    print('Creating dataset')
+    responses = reset_responses()
+    ndx = 0
+    for ndx, query in enumerate(query_list, start=1):
+        print(query)
+        task_id, answer, refs = get_gnqa(
+            query, config['key.api']['fahamuai'], config['DEFAULT']['DATA_DIR'])
+        responses['question'].append(query)
+        responses['answer'].append(answer)
+        responses['task_id'].append(task_id)
+        responses['contexts'].append(simplifyContext(refs))
+        time.sleep(API_PAUSE_SECONDS)
+        if ndx % BATCH_SIZE == 0:
+            print('Will print to file number {0}'.format(ndx // BATCH_SIZE))
+            outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(
+                config['out.response.dataset']['gpt4o_dir'],
+                level, domain, ndx // BATCH_SIZE)
+            writeDatasetFile(responses, outp_file)
+            responses = reset_responses()
+    # Whatever is left after the last full batch goes to the next file number.
+    if responses['question']:
+        outp_file = '{0}dataset_{1}_{2}_{3}.json'.format(
+            config['out.response.dataset']['gpt4o_dir'],
+            level, domain, ndx // BATCH_SIZE + 1)
+        writeDatasetFile(responses, outp_file)
+
+
+def parse_responses(jsonfile):
+    """Group queries by expertise level and topic, then build each dataset."""
+    print('Parsing human responses')
+    levels = {'de': 'domainexpert', 'cs': 'citizenscientist'}
+    topics = ('general', 'aging', 'diabetes')
+    # Insertion order fixes the processing order: domain experts first, then
+    # citizen scientists, each over general, aging and diabetes.
+    groups = {
+        (lvl, tpc): {"level": name, "domain": tpc, "query": [], "task_id": []}
+        for lvl, name in levels.items()
+        for tpc in topics
+    }
+    for val in jsonfile.values():
+        lvl = val.get("level")
+        for ndx, qry in enumerate(val.get("query")):
+            ans = val.get("answer")[ndx] if "answer" in val else ""
+            tpc_code = val.get("topic")[ndx]
+            # topic codes: 0 is general, 1 is aging, anything else is diabetes
+            tpc = "general" if tpc_code == 0 else "aging" if tpc_code == 1 else "diabetes"
+            tskd = val.get("task_id")[ndx]
+            group = groups.get((lvl, tpc)) if isinstance(lvl, str) else None
+            if group is None:
+                print('Somehow there is a query without a topic or expertise level')
+            else:
+                addToDataList(group, qry, ans, tskd)
+    for group in groups.values():
+        create_datasets_from_taskid(group)
+
+
+def addToDataList(data_lst, qry, ans, tskd):
+    """Append one query, its task id and its answer to data_lst."""
+    data_lst["query"].append(qry)
+    data_lst["task_id"].append(tskd)
+    data_lst.setdefault("answer", []).append(ans)
+
+
+def create_datasets_from_taskid(info_dict):
+    """Fetch the response for each task id and write them in batches.
+
+    An answer recorded in info_dict["answer"] takes precedence; when it is
+    empty or absent, the answer returned for the task id is used instead.
+    """
+    print('Creating dataset of questions from {0} in the topic of {1}'.format(
+        info_dict["level"], info_dict["domain"]))
+    responses = reset_responses()
+    query_list = info_dict["query"]
+    answers = info_dict.setdefault("answer", [])
+    ndx = 0
+    for ndx, task_id in enumerate(info_dict["task_id"], start=1):
+        _, an_answer, refs = get_response_from_taskid(
+            config['key.api']['fahamuai'], task_id)
+        responses['question'].append(query_list[ndx - 1])
+        # A missing answer counts as empty instead of raising IndexError.
+        recorded = answers[ndx - 1] if ndx <= len(answers) else ""
+        responses['answer'].append(an_answer if recorded == "" else recorded)
+        responses['task_id'].append(task_id)
+        responses['contexts'].append(simplifyContext(refs))
+        time.sleep(API_PAUSE_SECONDS)
+        if ndx % BATCH_SIZE == 0:
+            outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(
+                config['out.response.dataset']['human_dir'],
+                info_dict["level"], info_dict["domain"], ndx // BATCH_SIZE)
+            writeDatasetFile(responses, outp_file)
+            responses = reset_responses()
+    if responses['question']:
+        outp_file = '{0}dataset_{1}_{2}_{3}_two.json'.format(
+            config['out.response.dataset']['human_dir'],
+            info_dict["level"], info_dict["domain"], ndx // BATCH_SIZE + 1)
+        writeDatasetFile(responses, outp_file)
+
+
+def main(argv):
+    """Load the query file argv[1] and build datasets for mode argv[2]."""
+    try:
+        read_file = str(argv[1])
+        file_type = str(argv[2])
+    except IndexError:
+        sys.exit('Example use "python3 retrieve_context.py data/queries/qlist.json human/gpt4o"')
+
+    print('Read input file')
+    with open(read_file, "r") as r_file:
+        file_lst = json.load(r_file)
+    if file_type == "gpt4o":
+        parse_document(file_lst)
+    else:
+        parse_responses(file_lst)
+
+
+if __name__ == "__main__":
+    main(sys.argv)
\ No newline at end of file
-- 
cgit v1.2.3