aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShelbySolomonDarnell2024-09-24 23:40:11 +0300
committerShelbySolomonDarnell2024-09-24 23:40:11 +0300
commitcb28feac47dc1f6147260f1bc057970c54b314f3 (patch)
tree6c45082bc4d723468609f774170339c718576873
parentea942f68346abcd6e51d1cc96b0c90361c3cdfa2 (diff)
downloadgn-ai-cb28feac47dc1f6147260f1bc057970c54b314f3.tar.gz
Human questions in json format, code for formatting r2r response
-rw-r--r--gnqa/paper2_eval/data/human_questions.json172
-rw-r--r--gnqa/paper2_eval/data/testresp.json324
-rw-r--r--gnqa/paper2_eval/src/parse_r2r_result.ipynb262
-rw-r--r--gnqa/paper2_eval/src/parse_r2r_result.py44
-rw-r--r--gnqa/paper2_eval/src/run_questions.py45
5 files changed, 847 insertions, 0 deletions
diff --git a/gnqa/paper2_eval/data/human_questions.json b/gnqa/paper2_eval/data/human_questions.json
new file mode 100644
index 0000000..4142e5b
--- /dev/null
+++ b/gnqa/paper2_eval/data/human_questions.json
@@ -0,0 +1,172 @@
+[
+ {
+ "level": "domainexpert",
+ "domain": "gn",
+ "query": [
+ "What are the potential benefits and risk associated with gene editing technologies like CRISPRR-Cas9?",
+ "How does epigenetics inluence gene expression without changing the underlying DNA sequence?",
+ "Describe the role of mitochondrial DNA in heredity and how it differs from nuclear DNA.",
+ "What are the ethical considerations surrounding prenatal genetic testing and the selective termination of pregnancies based on genetic factors?",
+ "Create a how-to guide for genetic sequencing.",
+
+ "Which genes give a predisposition to developing T1D?",
+ "What is ensembl",
+ "Which database can I use for genetic, genomics, phenotype, and disease-related data generated from rat research?",
+ "What is RGD?",
+ "What resources can I use to do pathway analyses?",
+
+ "Once a sperm combines with an egg, what determines how traits are passed onto the resulting lifeform?",
+ "Why is genetic tracing matrilineal rather than patrilineal?",
+ "Explain the process of DNA replication and how it ensures accurate copying of genetic information during cell division.",
+ "What are the potential benefits and risks associated with gene editing technologies like CRISPR-Cas9?",
+ "How does one tell the difference between X and Y DNA, with repsect to DNA tracing and determining QTLs?",
+
+
+ "For text and biological resources, do you mean add some books (on biology stuff) or/and web resources (as ensembl) on your system?",
+ "what is ensembl?",
+ "What is the difference between QTL mapping and GWAS?",
+ "How do I determine which gene in my QTL is causal for the trait?",
+ "Why do males have two Y chromosomes and females only one?",
+
+ "How does one tell the difference between X and Y DNA, with respect to DNA tracing and determining QTLs",
+ "Once a sperm combines with an egg, what determines how traits are passed onto the resulting lifeform?",
+ "How can I add a new species to the GeneNetwork database?",
+ "which genes are typically associated with diabetes in QTL analyses?",
+ "In which diseases is the gene TCF7L2 involved?",
+
+ "Once a sperm combines with an egg, what determines how traits are passed onto the resulting lifeform?",
+ "Can you explain what a ribosomal binding site at a high level and make it accessable to a non-expert?",
+ "Once a sperm combines with an egg, what determines how traits are passed onto the resulting lifeform?",
+ "Can you explain the difference between sequencing with short reads vs long reads? Please make you answer accessible to a non-expert.",
+ "Can you explain why using a pangenome-based reference might be more useful than simply using a single linear reference? Please make you answer accessible to a non-expert.",
+
+ "Is all genetic regulation done through DNA (e.g., prompters, repressors, activators) or are there other forms of genetic regulation? Please make you answer accessible to a non-expert.",
+ "What are the different relationship between traits?",
+ "Can landscape of QTL and GWAS hits be used to find relationships between traits?"
+ ]
+ },
+ {
+ "level": "domainexpert",
+ "domain": "diabetes",
+ "query": [
+ "How is gene expression in the liver affected by diabetes?",
+ "Is any of the genes SH2B3, IFIH1 or ERBB3 related to diabetes?",
+ "nutrition is a factor for diabetes. how can genomics be use to better understand nutritional factors of diabetes",
+ "nutrition is a factor for diabetes. construct an abstract about how can genomics be use to better understand nutritional factors of diabetes",
+ "Is the gene TCF7L2 involved in diabetes?",
+
+ "Is any of the genes SH2B3, IFIH1 or ERBB3 related to diabetes?",
+ "How can I use genenetwork to find genes related with diabetes in humans?",
+ "How can I use the GeneNetwork tool to find genes related with diabetes in humans?",
+ "what are confounding factors in diabetes?",
+ "How is the immune system related to diabetes?",
+
+ "What are the genomic variants associated with immune system components and diabetes?",
+ "What is the role of the immune system in the metabolomics of diabetes and associated conditions?",
+ "Can the landscape of QTL and GWAS hits be used to dissect the role of immune system in diabetes and complications?"
+
+ ]
+ },
+ {
+ "level": "domainexpert",
+ "domain": "aging",
+ "query": [
+ "What is the significance of the length of telomeres?",
+ "Which mouse genes have been associated with longevity?",
+ "what genetic factor are associated with aging",
+ "which genes are typically associated with early aging?",
+ "How do I generate a linkage or association mapping study in mice to understand aging?",
+ "is there a specific genetic variation that can cause someone to live longer? please make your answer accessible to a non-expert"
+ ]
+ },
+ {
+ "level": "citizenscientist",
+ "domain": "gn",
+ "query": [
+ "What is the most cited environmental factor for the onset of asthma?",
+ "How would one extract the DNA, from say, flora or fauna?","genetics",
+ "what is bioinformatics",
+ "Explain the process of finding a genetic marker followed by a quantitative trait loci.",
+
+ "What about recombination in human centromeres?",
+ "How does recombination work in human centromeres?",
+ "What about recombination in the human genome?",
+ "Create a how to guide for genetic sequencing",
+ "What is the significance of the length of telomeres? ",
+
+ "Once a sperm combines with an egg, what determines how traits are passed on to the resulting lifeform? ",
+ "Why is genetic tracing matrilineal rather than patrilineal? ",
+ "How does one tell the difference between X and Y DNA, with respect to DNA tracing and determining QTLs?",
+ "what type of dataset is useful for qtl mapping analysis in genenetwork2?",
+ "what are the bioinformatics tools for QTLs analysis?",
+
+ "what are the statistical approaches for qtls analysis?",
+ "Create a how-to guide for GWAS analysis?",
+ "Create a how-to guide for genetic sequencing",
+ "Create a how-to guide for genetic sequencing.",
+ "What is the significance of the length of telomeres?",
+
+ "Create a how-to guide for genetic sequencing",
+ "Create a guide for genetic sequencing",
+ "Define dyslipidemia.",
+ "What is cytochrome?",
+ "How does one tell the difference between X and Y DNA, with respect to DNA tracing and determining QTLs?",
+
+ "how does environment influence fertilisation",
+ "how does diet impact someone's height",
+ "which animal has the same number of chromosomes as human",
+ "what ensures brains work",
+ "how do our brains maintain emotions",
+
+ "what hormones do our brains release during stressful experiences?",
+ "what is the use of corticosterone?"
+ ]
+ },
+ {
+ "level": "citizenscientist",
+ "domain": "diabetes",
+ "query": [
+ "What are the genetic bases for the varying efficacy of diabetes treatments among individuals?",
+ "Explain Protective Genetic Factors Against Diabetes in Elderly Populations",
+ "Explain Effect of Lifestyle Modifications on Aging-Associated Diabetes Risk",
+ "Explain The Role of Longevity Genes in Protecting Against Diabetes",
+ "What are the types of diabetes",
+
+ "How many types of diabetes exist?",
+ "Is there a direct association between aging and susceptibility to having diabetes?",
+ "How does genetics influence the emergency of diabetes?",
+ "what genes are associated with diabetes?",
+ "What causes diabetes?",
+
+ "Does cycling reduce risk of diabetes?",
+ "How can GeneNetwork assist in identifying genetic factors involved in diabetes?",
+ "What specific tools within GeneNetwork are most useful for diabetes research, and how are they applied?",
+ "What role does insulin play in the regulation of blood glucose levels?",
+ "How does aging affect the risk of developing type 2 diabetes?",
+ "Can lifestyle changes reverse type 2 diabetes?"
+
+ ]
+ },
+ {
+ "level": "citizenscientist",
+ "domain": "aging",
+ "query": [
+ "List as many studies as you can that include rapamycin.",
+ "Why is it so diffuclut to map gene loci that control aging in humans?",
+ "What is apoptosis?",
+ "which genes are involved in the aging process",
+ "what causes the aging process",
+
+ "which genes are involved in aging",
+ "what genes are involved in the aging process",
+ "Describe the genotypes related to Alzheimers and dementia which have commonalities with those for aging.",
+ "Describe the genotypes related to Alzheimer's and dementia which have commonalities with those for aging.",
+ "What genetic factors influence aging in humans?",
+
+ "what genes are associated with aging?",
+ "Which genes are associated with aging in human",
+ "What is GeneNetwork and how does it relate to aging research?"
+
+ ]
+ }
+] \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/testresp.json b/gnqa/paper2_eval/data/testresp.json
new file mode 100644
index 0000000..b40ee21
--- /dev/null
+++ b/gnqa/paper2_eval/data/testresp.json
@@ -0,0 +1,324 @@
+{
+ "1": {
+ "results": {
+ "completion": {
+ "id": "chatcmpl-AB4tgOqLfpDSZCayAF10UZVwALr7J",
+ "choices": [
+ {
+ "finish_reason": "stop",
+ "index": 0,
+ "message": {
+ "content": "Here are the studies that include rapamycin:\n\n1. **Yin L, Ye S, Chen Z, Zeng Y (2012)**: \"Rapamycin preconditioning attenuates transient focal cerebral ischemia/reperfusion injury in mice\" [1].\n2. **Spilman P, Podlutskaya N, Hart MJ, Debnath J, Gorostiza O, Bredesen D, Richardson A, Strong R, Galvan V (2010)**: \"Inhibition of mTOR by rapamycin abolishes cognitive deficits and reduces amyloid-beta levels in a mouse model of Alzheimer's disease\" [1].\n3. **Anisimov VN, Zabezhinski MA, Popovich IG, Piskunova TS, Semenchenko AV, Tyndyk ML, Yurova MN, Rosenfeld SV, Blagosklonny MV (2011)**: \"Rapamycin increases lifespan and inhibits spontaneous tumorigenesis in inbred female mice\" [2].\n4. **Lamming DW, Ye L, Katajisto P, Goncalves MD, Saitoh M, Stevens DM, et al. (2012)**: \"Rapamycin-induced insulin resistance is mediated by mTORC2 loss and uncoupled from longevity\" [5].\n5. **Tataranni T, Biondi G, Cariello M, Mangino M, Colucci G, Rutigliano M, et al. (2011)**: \"Rapamycin-induced hypophosphatemia and insulin resistance are associated with mTORC2 activation and klotho expression\" [5].\n6. **Wilkinson JE, Burmeister L, Brooks SV, Chan CC, Friedline S, Harrison DE, et al. (2012)**: \"Rapamycin slows aging in mice\" [7].\n7. **Reihl K, Seals D, Henson G, LaRocca T, Magerko K, Bosshardt G, et al. (2013)**: \"Dietary rapamycin selectively improves arterial function in old mice\" [7].\n8. **Miller RA, Harrison DE, Astle CM, Fernandez E, Flurkey K, Han M, et al. (2014)**: \"Rapamycin-mediated lifespan increase in mice is dose and sex dependent and metabolically distinct from dietary restriction\" [10].\n\nThese studies cover a range of topics including the effects of rapamycin on cerebral ischemia, Alzheimer's disease, lifespan extension, insulin resistance, aging, and arterial function.",
+ "role": "assistant"
+ }
+ }
+ ],
+ "created": 1727203796,
+ "model": "gpt-4o-2024-05-13",
+ "object": "chat.completion",
+ "system_fingerprint": "fp_e375328146",
+ "usage": {
+ "completion_tokens": 529,
+ "prompt_tokens": 1544,
+ "total_tokens": 2073
+ }
+ },
+ "search_results": {
+ "vector_search_results": [
+ {
+ "id": "57be0715-77c8-55e3-8239-56e1fa11a543",
+ "score": 0.6735553741455078,
+ "metadata": {
+ "text": "168. Yin L, Ye S, Chen Z, Zeng Y . Rapamycin preconditioning attenuates tran-\nsient focal cerebral ischemia/reperfusion injury in mice. Int J Neurosci. 2012;122:748756. doi: 10.3109/00207454.2012.721827\n 169. Spilman P, Podlutskaya N, Hart MJ, Debnath J, Gorostiza O, Bredesen \nD, Richardson A, Strong R, Galvan V . Inhibition of mTOR by rapamy-cin abolishes cognitive deficits and reduces amyloid-beta levels in a \nmouse model of Alzheimers disease. PLoS One. 2010;5:e9979. doi: \n10.1371/journal.pone.0009979",
+ "title": "2018 - Mechanisms of Vascular Aging.pdf",
+ "version": "v0",
+ "chunk_order": 262,
+ "document_id": "659b84b6-63dd-5bb1-80ee-7478ed3c47e3",
+ "extraction_id": "3e65812c-453e-53aa-83ab-92f2ce15da29",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ },
+ {
+ "id": "03e62089-fef5-5ed5-bf7f-36ff595fbaea",
+ "score": 0.67023766040802,
+ "metadata": {
+ "text": "Anisimov VN, Zabezhinski MA, Popovich IG, Piskunova TS,\nSemenchenko AV, Tyndyk ML, Yurova MN, Rosenfeld SV,Blagosklonny MV (2011b) Rapamycin increases lifespan and\ninhibits spontaneous tumorigenesis in inbred female mice. Cell\nCycle 10:42304236\nAugustine JJ, Bodziak KA, Hricik DE (2007) Use of sirolimus in\nsolid organ transplantation. Drugs 67:369391\nBannister CA, Holden SE, Jenkins-Jones S, Morgan CL, Halcox JP,",
+ "title": "2016 - The dog aging project translational geroscience in companion.pdf",
+ "version": "v0",
+ "chunk_order": 81,
+ "document_id": "e841c6bd-78b8-56e1-b3dd-e2bcc8a0f590",
+ "extraction_id": "2c1fcce1-b723-5f9f-8f66-49ed7895f2ac",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ },
+ {
+ "id": "fe5b60e5-ded6-5950-bc1c-72cb39e16234",
+ "score": 0.663162350654602,
+ "metadata": {
+ "text": "ACCEPTED MANUSCRIPTACCEPTED MANUSCRIPT\nmTOR complex 2 (mTORC2), the less clearly identified and less sensitive to rapamycin. Most information to \ndate on the r ole of mTOR has studied the insulin/nutrient signaling via the mTORC1 and significantly less in \nknown about the role of mTORC2 ( in this review, future references measure either mTORC1 or general mTOR \nactivity )[251]. Earlier this decade studies showed that decreasing TOR signaling, genetically or with rapamycin,",
+ "title": "2015 - Cellular and Molecular Biology of Aging Endothelial Cells.pdf",
+ "version": "v0",
+ "chunk_order": 106,
+ "document_id": "815d7f3e-e219-502f-aba0-57a68ae787d3",
+ "extraction_id": "86f9502b-7a3a-501f-9053-8af1d37043b4",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ },
+ {
+ "id": "d7dcefa4-133c-594c-b8a8-38fe945c6b5c",
+ "score": 0.6623571120287721,
+ "metadata": {
+ "text": "Harrison, D.E., Strong, R., Sharp, Z.D., Nelson, J.F., Astle, C.M., Flurkey, K.,Nadon, N.L., Wilkinson, J.E., Frenkel, K., Carter, C.S., et al. (2009). Rapamycin\nCell148, January 20, 2012 2012 Elsevier Inc. 55",
+ "title": "2012 - Aging, Rejuvenation, and Epigenetic.pdf",
+ "version": "v0",
+ "chunk_order": 139,
+ "document_id": "bde26feb-f423-51b0-89ec-6f079bfc8b17",
+ "extraction_id": "d23b6aab-f299-5370-b3b6-0615112681f0",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ },
+ {
+ "id": "907d7d31-04db-5f66-b390-7740142af182",
+ "score": 0.6616793870925903,
+ "metadata": {
+ "text": "96. Lamming DW, Ye L, Katajisto P, Goncalves MD, Saitoh M, Stevens DM, etal. Rapamycin- \ninduced insulin resistance is mediated by mTORC2 loss and uncoupled from longevity. \nScience. 2012;335:163843.\n 97. Tataranni T, Biondi G, Cariello M, Mangino M, Colucci G, Rutigliano M, etal. Rapamycin- \ninduced hypophosphatemia and insulin resistance are associated with mTORC2 activation \nand klotho expression. Am J Transplant. 2011;11(8):165664.",
+ "title": "2020 - Clinical Genetics and Genomics of Aging.pdf",
+ "version": "v0",
+ "chunk_order": 200,
+ "document_id": "62b635c3-040e-512a-b016-6ef295308a1e",
+ "extraction_id": "a47672ed-9f4d-5aa8-8b7e-f10753246a6e",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ },
+ {
+ "id": "40cbc230-7175-522e-b0ae-3901f2cfac0b",
+ "score": 0.655274510383606,
+ "metadata": {
+ "text": "ing these aspects in future studies on the effects of resveratrol could help to study in \ngreater depth the mechanisms of action of this compound [56].\n Rapamycin\nRapamycin is a macrolide isolated from Streptomyces hygroscopicus, a bacteria \nfrom Pascua Island (Rapa Nui). It has functions as an antibiotic, an immune sup-\npressant drug, and it is also proposed as a CRM.After the first studies, it was found \nthat rapamycin could induce the extension of the replicative life of yeast through the",
+ "title": "2020 - Clinical Genetics and Genomics of Aging.pdf",
+ "version": "v0",
+ "chunk_order": 1635,
+ "document_id": "62b635c3-040e-512a-b016-6ef295308a1e",
+ "extraction_id": "42c88d1d-4bb6-50f8-9010-379e15650d96",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ },
+ {
+ "id": "a9666b11-4567-52dd-90c8-be2238dafdcb",
+ "score": 0.6544143557548523,
+ "metadata": {
+ "text": "[257] Wilkinson JE, Burmeister L, Brooks SV, Chan CC, Friedline S, Harrison DE, et al. Rapamycin slows \naging in mi ce. Aging Cell. 2012;11:675 -82. \n[258] Selman C, Tullet JM, Wieser D, Irvine E, Lingard SJ, Choudhury AI, et al. Ribosomal protein S6 kinase 1 \nsignaling regulates mammalian life span. Science. 2009;326:140 -4. \n[259] Reihl K, Seals D, Henson G, LaRocca T, Mag erko K, Bosshardt G, et al. Dietary rapamycin selectively \nimproves arterial function in old mice. FASEB Journal. 2013;27:1194.17.",
+ "title": "2015 - Cellular and Molecular Biology of Aging Endothelial Cells.pdf",
+ "version": "v0",
+ "chunk_order": 255,
+ "document_id": "815d7f3e-e219-502f-aba0-57a68ae787d3",
+ "extraction_id": "0e789eef-b085-5fc2-b10a-8572bc28fa1b",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ },
+ {
+ "id": "729598dc-94e6-5f52-ae19-071c959c7dd2",
+ "score": 0.6457819553743639,
+ "metadata": {
+ "text": "29. Wilkinson JE, Burmeister L, Brooks SV, Chan C-C, Friedline S, Harrison DE,\net al. Rapamycin slows aging in mice. Aging Cell. 2012;11:675 82.\n30. Lamming DW, Ye L, Katajisto P, Goncalves MD, Saitoh M, Stevens DM, et al.\nRapamycin-induced insulin resistance is mediated by mTORC2 loss and\nuncoupled from longevity. Science. 2012;335:1638 43.\n31. Zampieri M, Ciccarone F, Calabrese R, Franceschi C, Brkle A, Caiafa P.\nReconfiguration of DNA methylation in aging. Mech Ageing Dev. 2015;151:60 70.",
+ "title": "2017 - Epigenetic aging signatures in mice livers.pdf",
+ "version": "v0",
+ "chunk_order": 102,
+ "document_id": "b20b11a6-1490-51b8-9218-c441a2e65ba7",
+ "extraction_id": "5d4bf4c1-5bb4-5de6-a1bb-0485163a5373",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ },
+ {
+ "id": "cbc86652-98e1-5464-a0ce-2272111246df",
+ "score": 0.640692949295044,
+ "metadata": {
+ "text": "files [55, 62]. Of note, rapamycin in particular appears to\ninduce additional changes u nrelated to age-associated\nchanges. While both CR and rapamycin induced these\nnon-age-related effects, this effect was much more marked\nfor rapamycin. These non age-related epigenetic changes\ninclude gains of methylation at genes, enhancers and CpG\nislands and losses of methylation at genes and enhancers.\nConceivably, such non age-related effects of rapamycin in",
+ "title": "2017 - Diverse interventions that extend mouse.pdf",
+ "version": "v0",
+ "chunk_order": 120,
+ "document_id": "dc7ad71a-a4d7-5901-a016-9a6fb2b91a2f",
+ "extraction_id": "d634b92e-0802-5ba8-a4c5-9e45462cd7d5",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ },
+ {
+ "id": "f8630239-fd67-5214-a5cd-f965d878f712",
+ "score": 0.6388615369796753,
+ "metadata": {
+ "text": "23\n 94. Chakrabarti P, English T, Shi J, Smas CM, Kandror KV .Mammalian target of rapamycin \ncomplex 1 suppresses lipolysis, stimulates lipogenesis, and promotes fat storage. Diabetes. \n2010;59:77581.\n 95. Miller RA, Harrison DE, Astle CM, Fernandez E, Flurkey K, Han M, et al. Rapamycin- \nmediated lifespan increase in mice is dose and sex dependent and metabolically distinct from \ndietary restriction. Aging Cell. 2014;13:46877.",
+ "title": "2020 - Clinical Genetics and Genomics of Aging.pdf",
+ "version": "v0",
+ "chunk_order": 199,
+ "document_id": "62b635c3-040e-512a-b016-6ef295308a1e",
+ "extraction_id": "a47672ed-9f4d-5aa8-8b7e-f10753246a6e",
+ "associatedQuery": "List as many studies as you can that include rapamycin."
+ }
+ }
+ ],
+ "kg_search_results": null
+ }
+ }
+ },
+ "2": {
+ "results": {
+ "completion": {
+ "id": "chatcmpl-AB4tsENcPR5A4o83baOVNBlgNGHv6",
+ "choices": [
+ {
+ "finish_reason": "stop",
+ "index": 0,
+ "message": {
+ "content": "Mapping gene loci that control aging in humans is challenging due to several factors:\n\n1. **Complexity of the Phenotype**: Aging is one of the most complex phenotypes, involving multiple genes that mediate the process through numerous different pathophysiological processes and diseases [4]. This complexity makes it difficult to pinpoint specific gene loci that control aging.\n\n2. **Genetic Diversity and Environmental Heterogeneity**: Human longevity studies face inherent problems such as high degrees of environmental heterogeneity and genetic diversity, which complicate the identification of consistent genetic markers [7].\n\n3. **Rare vs. Common Variants**: Finding common genetic variants that account for the entire genetic component of human lifespan variability has proven difficult. If rare variants explain most of the genetic variation in aging among humans, new genotyping techniques and analysis methods are required to identify these genes and pathways [2], [3].\n\n4. **Numerous Loci Involved**: More than 1000 loci exhibit age-dependent changes in gene expression, but not all of these loci are causally involved in aging. The sheer number of loci to sort out presents a substantial problem [6].\n\n5. **Lack of Consistent Associations**: Common variants within genes involved in aging pathways have not been consistently associated with lifespan in humans, indicating the complexity and variability of the genetic factors involved [10].\n\nThese factors collectively contribute to the difficulty in mapping gene loci that control aging in humans.",
+ "role": "assistant"
+ }
+ }
+ ],
+ "created": 1727203808,
+ "model": "gpt-4o-2024-05-13",
+ "object": "chat.completion",
+ "system_fingerprint": "fp_a5d11b2ef2",
+ "usage": {
+ "completion_tokens": 291,
+ "prompt_tokens": 1149,
+ "total_tokens": 1440
+ }
+ },
+ "search_results": {
+ "vector_search_results": [
+ {
+ "id": "bf384c33-974b-57c3-867d-3515e1d45c49",
+ "score": 0.698593178433386,
+ "metadata": {
+ "text": "that is differentiated at hundreds of loci. Many ofthe loci that control aging in Drosophila will not have\nthe same effect on human aging. On the other hand,we expect that other loci will work in a parallelmanner in humans. We have no way of knowing a\npriori which group any particular locus will belong\nin. Thus, the individual mutants that increase\nDrosophila lifespan may or may not come from loci",
+ "title": "2002 - Pharmacology, Genomics, and the Evolutionary Biology.pdf",
+ "version": "v0",
+ "chunk_order": 32,
+ "document_id": "1bc636a3-6ce0-5fea-b549-0dae90a78f1b",
+ "extraction_id": "aa03a9d5-4e30-5fb0-bee1-6dd8e6a549b3",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ },
+ {
+ "id": "62e2bf90-fdb9-5499-a063-cee6c92feb40",
+ "score": 0.6800800764584567,
+ "metadata": {
+ "text": "understanding of molecular mechanisms underlyingthe human ageing process. Like other complexhuman traits, nding common variants that accountfor the entire genetic component of human lifespan\nvariability has proved difcult. If rare variants rather\nthan common variants explain most of the genetic vari-ation in ageing among humans, new genotypingtechniques and new analysis methods must be devel-oped to nd genes and pathways involved in ageing.Next-generation sequencing technologies are faster",
+ "title": "2010 - Genetics and genomics of human ageing.pdf",
+ "version": "v0",
+ "chunk_order": 73,
+ "document_id": "633f3149-e966-53ef-aa7d-b759398ed541",
+ "extraction_id": "04babc6e-5138-5804-a150-70254859800d",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ },
+ {
+ "id": "25a0cb1d-0207-5197-9b6a-389b16c1f17e",
+ "score": 0.6800800764584567,
+ "metadata": {
+ "text": "understanding of molecular mechanisms underlyingthe human ageing process. Like other complexhuman traits, nding common variants that accountfor the entire genetic component of human lifespan\nvariability has proved difcult. If rare variants rather\nthan common variants explain most of the genetic vari-ation in ageing among humans, new genotypingtechniques and new analysis methods must be devel-oped to nd genes and pathways involved in ageing.Next-generation sequencing technologies are faster",
+ "title": "2011 - Genetics and genomics of human ageing.pdf",
+ "version": "v0",
+ "chunk_order": 73,
+ "document_id": "08eee102-d627-5f1b-84c7-603c38981adf",
+ "extraction_id": "27e291f1-e6bf-5e76-9245-522de74ea63b",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ },
+ {
+ "id": "f1f870c1-b1ed-5eeb-8831-3484d35414b2",
+ "score": 0.6764066228038036,
+ "metadata": {
+ "text": "Several explanations are possible for the lack of genome-\nwide signicant ndings. First, mortality is arguably 1 ofthe most complex phenotypes, and several trajectories to-ward extreme old age have been identied (Evert et al.,2003). Multiple genes could mediate the aging process butwould have their effects through numerous different patho-physiological processes and diseases that act as intermediate",
+ "title": "2011 - A genome-wide association study of aging.pdf",
+ "version": "v0",
+ "chunk_order": 83,
+ "document_id": "8e9c1150-1047-54a2-bf85-1cc5000a6811",
+ "extraction_id": "253a4339-29d4-58c2-8a01-5137d94873b6",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ },
+ {
+ "id": "e0cce1c5-8709-5218-99b6-48a6ba242931",
+ "score": 0.6634044248880278,
+ "metadata": {
+ "text": "effect fundamental mechanisms of aging (14, 16). The drawbacksof such studies include the improbability of picking the right geneto study the myriad of known and unknown genes affecting theprocess of interest (17). The linkage study described heremarkedly improves the efficiency of such association studies bydefining a region likely to contain polymorphism(s) with signif-icant influence on life span.\nAdditional association studies with these families and repli-",
+ "title": "2001 - A genome-wide scan for linkage to human.pdf",
+ "version": "v0",
+ "chunk_order": 40,
+ "document_id": "1431984a-82d9-51d4-a23c-5f76a02ab554",
+ "extraction_id": "17246c43-2e44-579b-867d-3dc7150ceedd",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ },
+ {
+ "id": "86393802-9171-57d8-806d-6d2ccfb3f0b2",
+ "score": 0.6629835959956287,
+ "metadata": {
+ "text": "[12]More than 1000\nloci exhibit age-dependent changes in geneexpression (1264 genes). This is a substantialproblem, because not all of these loci will be causally\ninvolved in aging, and there are so many to sort out.\nAn additional application of gene chip technologyis to compare ies with and without a lifespanmodulating physiological treatment. Pletcher et al.",
+ "title": "2002 - Pharmacology, Genomics, and the Evolutionary Biology.pdf",
+ "version": "v0",
+ "chunk_order": 36,
+ "document_id": "1bc636a3-6ce0-5fea-b549-0dae90a78f1b",
+ "extraction_id": "e501662f-ffca-563b-97a7-b682a5d7f6ba",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ },
+ {
+ "id": "380ca35e-b42b-59b4-aef7-aaf2ba3bb59d",
+ "score": 0.6576070589540312,
+ "metadata": {
+ "text": "span and have yielded insights into potential biological\npathways and processes related to aging. Despite these\nsuccesses, several problems are inherent in human\nlongevity studies including potentially high degrees ofenvironmental heterogeneity, genetic diversity, and lack of\nbirth matched controls, among others [ 8].\nInbred mouse strains represent a powerful alternative for\nidentifying genes underlying complex trait genes such as\nlongevity [ 9]. Initial mapping approaches include quanti-",
+ "title": "2015 - A Chromosome 13 locus is associated with male-specific mortality in mice.pdf",
+ "version": "v0",
+ "chunk_order": 8,
+ "document_id": "ad8f2626-87fb-520e-8cef-ee9a9cc3ab0b",
+ "extraction_id": "df0b4be9-3393-5642-a722-ccafffb60df8",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ },
+ {
+ "id": "eff6939d-b0bf-5442-ad2a-e141e6acfb3d",
+ "score": 0.6557591567254893,
+ "metadata": {
+ "text": "86\nDespite the progress in elucidating the genetic basis of\nlongevity and aging in many organism models, ranging from\nyeast and nematodes to mice, a large-scale, systematic\napproach for the exploration of aging in humans is still atthe beginning.\n87So far, a variation in the gene coding for\napolipoprotein E ( APOE )88,89and forkhead box O3A gene\n(FOXO3A ) have been consistently associated with longevity\nin various populations.90,91Recently, stress response genes,",
+ "title": "2011 - Clinical aspects and molecular diagnostics of skin aging.pdf",
+ "version": "v0",
+ "chunk_order": 66,
+ "document_id": "e32f8f2c-d3ad-5dae-a393-9bd87c370ebe",
+ "extraction_id": "1a0f6c6d-fc06-54d0-ba5f-87a6cb57b718",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ },
+ {
+ "id": "f6ed1f0c-e4ea-5459-bb63-136decc11eee",
+ "score": 0.6523834661359631,
+ "metadata": {
+ "text": "discover core mechanisms of regulation.ANALYSIS OF HUMAN VARIATION IN\nTHE GENETIC CONTROL OF LONGEVITY\nHeritability studies have convincingly demonstrated that at\nleast some fraction of human lifespan is heritable. In tandem,\nlarge-scale genome-wide association studies (GWAS) have\nidentied numerous loci associated with age-related traits\n(Buniello et al., 2019). While genetic studies have functionally\nshown an inverse eect of multiple age-related, disease-",
+ "title": "2021 - Footprints in the Sand Deep Taxonomic Comparisons in Vertebrate Genomics to Unveil the Genetic Programs of Human Longevity.pdf",
+ "version": "v0",
+ "chunk_order": 13,
+ "document_id": "0dc45abe-ab02-5b07-9916-7093b53323c0",
+ "extraction_id": "34223e0e-590c-5f26-b120-b7250cd91b99",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ },
+ {
+ "id": "146229ff-4d17-5319-88e4-6040f30cf0c1",
+ "score": 0.6509123236484721,
+ "metadata": {
+ "text": "Kenyon, 2010; Vellai et al., 2003 ). However, in humans,\ncommon variants within genes involved in these pathways\nhave not been consistently associated with lifespan ( Chris-tensen et al., 2006; Kenyon, 2010; Kuningas et al., 2008;\nVijg and Suh, 2005 ).\nThe lack of success in the identication of genes related\nto aging in humans may be due to the complexity of the\nphenotype. One approach to investigate aging and longevity\nis to compare frequencies of genetic variants between no-",
+ "title": "2011 - A genome-wide association study of aging.pdf",
+ "version": "v0",
+ "chunk_order": 16,
+ "document_id": "8e9c1150-1047-54a2-bf85-1cc5000a6811",
+ "extraction_id": "beab62d0-2e6f-5d77-b0a2-7375a9ed1364",
+ "associatedQuery": "Why is it so difficult to map gene loci that control aging in humans?"
+ }
+ }
+ ],
+ "kg_search_results": null
+ }
+ }
+ }
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/src/parse_r2r_result.ipynb b/gnqa/paper2_eval/src/parse_r2r_result.ipynb
new file mode 100644
index 0000000..8ceac72
--- /dev/null
+++ b/gnqa/paper2_eval/src/parse_r2r_result.ipynb
@@ -0,0 +1,262 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 116,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Parse RAGAS json output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import sys\n",
+ "\n",
+ "verbose = 0\n",
+ "read_file = '/data/code/gn-ai/gnqa/paper2_eval/data/rag_out_1.json'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 118,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "def iterate_json(obj, thedict):\n",
+ " if isinstance(obj, dict):\n",
+ " for key, val in obj.items():\n",
+ " if (key == \"text\"):\n",
+ " thedict[\"contexts\"].append(val.replace(\"\\n\", \" \").strip())\n",
+ " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n",
+ " elif (key == \"associatedQuery\"):\n",
+ " #thedict[\"answer\"] = val#.replace(\"\\n\", \" \").strip()\n",
+ " thedict[\"question\"] = val#.replace(\"\\n\", \" \").strip()\n",
+ " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n",
+ " elif (key == \"id\"):\n",
+ " thedict[\"id\"].append(val.replace(\"\\n\", \" \").strip())\n",
+ " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n",
+ " elif (key == \"title\"):\n",
+ " thedict[\"titles\"].append(val.replace(\"\\n\", \" \").strip())\n",
+ " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n",
+ " elif (key == \"document_id\"):\n",
+ " thedict[\"document_id\"].append(val.replace(\"\\n\", \" \").strip())\n",
+ " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n",
+ " elif (key == \"content\"):\n",
+ " thedict[\"answer\"] = val.replace(\"\\n\", \" \").strip()\n",
+ " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n",
+ " elif (key == \"extraction_id\"):\n",
+ " thedict[\"extraction_id\"].append(val.replace(\"\\n\", \" \").strip())\n",
+ " print((\"\", \"Key -> {0}\\tValue -> {1}\".format(key,val)) [verbose])\n",
+ " else:\n",
+ " if (len(obj.items()) == 1 ):\n",
+ " print(key, \" --> \", val)\n",
+ " iterate_json(val, thedict)\n",
+ " elif isinstance(obj, list):\n",
+ " for item in obj:\n",
+ " iterate_json(item, thedict)\n",
+ "\n",
+ "# this should be a json file with a list of input files and an output file\n",
+ "with open(read_file, \"r\") as r_file:\n",
+ " result_file = json.load(r_file)\n",
+ "\n",
+ "ragas_output = {\n",
+ " \"titles\": [],\n",
+ " \"extraction_id\": [],\n",
+ " \"document_id\": [],\n",
+ " \"id\": [],\n",
+ " \"contexts\": [],\n",
+ " \"answer\": \"\",\n",
+ " \"question\": \"\"}\n",
+ "vector_search_results = result_file[\"vector_search_results\"]\n",
+ "choices = result_file[\"choices\"]\n",
+ "iterate_json(vector_search_results, ragas_output)\n",
+ "iterate_json(choices, ragas_output)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 119,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " \"titles\": [\n",
+ " \"2011 - Annotating individual human genomes.pdf\",\n",
+ " \"2011 - Annotating individual human genomes.pdf\",\n",
+ " \"2011 - Annotating individual human genomes.pdf\",\n",
+ " \"2012 - Systems Biology Approaches to Nutrition.pdf\",\n",
+ " \"2006 - \\u03b22-adrenergic receptor and UCP3 variants modulate the relationship between age and type 2 diabetes mellitus.pdf\",\n",
+ " \"2001 - Demography in the age of genomics.pdf\",\n",
+ " \"2012 - Systems Biology Approaches to Nutrition.pdf\",\n",
+ " \"2004 - A genome scan for diabetic nephropathy in African Americans.pdf\",\n",
+ " \"2011 - Annotating individual human genomes.pdf\",\n",
+ " \"2012 - Systems Biology Approaches to Nutrition.pdf\"\n",
+ " ],\n",
+ " \"extraction_id\": [\n",
+ " \"80d78615-8424-5478-a01b-73e220bc0345\",\n",
+ " \"80d78615-8424-5478-a01b-73e220bc0345\",\n",
+ " \"80d78615-8424-5478-a01b-73e220bc0345\",\n",
+ " \"eb3de845-98db-505c-bb7f-c0f3259875fc\",\n",
+ " \"acf69ed8-c7b0-5d9f-8005-de020c9cf699\",\n",
+ " \"5f24a851-1de6-5b6e-8230-2da08806b01a\",\n",
+ " \"eb3de845-98db-505c-bb7f-c0f3259875fc\",\n",
+ " \"01ec7832-8a80-5f5e-aa26-3648f572c4a1\",\n",
+ " \"c17c74fa-12a3-5072-bb48-c179055db14d\",\n",
+ " \"eb3de845-98db-505c-bb7f-c0f3259875fc\"\n",
+ " ],\n",
+ " \"document_id\": [\n",
+ " \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n",
+ " \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n",
+ " \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n",
+ " \"6955478b-950d-5d29-b24c-3a5ca656f3ae\",\n",
+ " \"0ea34c04-5d09-5a32-89a7-c3add179927a\",\n",
+ " \"0f07fa43-feb6-5656-b7e7-b8faa86f5623\",\n",
+ " \"6955478b-950d-5d29-b24c-3a5ca656f3ae\",\n",
+ " \"5798fb6b-b3e6-57c4-9823-5428853dbfa1\",\n",
+ " \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n",
+ " \"6955478b-950d-5d29-b24c-3a5ca656f3ae\"\n",
+ " ],\n",
+ " \"id\": [\n",
+ " \"7656b48b-d191-516e-9753-d34efedd4812\",\n",
+ " \"d1af5c82-d226-5980-b5d9-90d7558d1880\",\n",
+ " \"bb2a67ec-135b-5d55-b33d-74b1dc085685\",\n",
+ " \"11fc663d2-2833-51e7-ae6a-55b007a6e27c\",\n",
+ " \"bb55a705-7399-550e-8285-07c33654b909\",\n",
+ " \"9bff43c0-fd12-572e-9996-24957edd17d2\",\n",
+ " \"2df84ccc-0d32-582e-bda6-9cd46bee5378\",\n",
+ " \"ff30f187-d5c3-5d01-8026-0588a77e9f44\",\n",
+ " \"57dc1ee5-4252-52c3-92cb-e2ac36cdc4d6\",\n",
+ " \"ff801099-e737-57b1-91af-a4cea20adb87\"\n",
+ " ],\n",
+ " \"contexts\": [\n",
+ " \"gene interaction and high predictive value, PLoS One 3 (5) (2008) e2031,doi:10.1371/journal.pone.0002031 . [107] M. van Hoek, A. Dehghan, J.C. Witteman, C.M. van Duijn, A.G. Uitterlinden, B.A. Oostra, A. Hofman, E.J. Sijbrands, A.C. Janssens, Predicting type 2 diabetes based on polymorphisms from genome-wide association studies: a population-based study, Diabetes 57 (11) (Nov 2008) 3122 3128. [108] Q. Lu, Y. Song, X. Wang, S. Won, Y. Cui, R.C. Elston, The effect of multiple genetic\",\n",
+ " \"variants in predicting the risk of type 2 diabetes, BMC Proc 3 (Suppl 7) (Dec 15 2009) S49. [109] K. Miyake, W. Yang, K. Hara, K. Yasuda, Y. Horikawa, H. Osawa, H. Furuta, et al., Construction of a prediction model for type 2 diabetes mellitus in the Japanese population based on 11 genes with strong evidence of the association, J. Hum. Genet. 54 (4) (Apr 2009) 236 241 [Epub 2009 Feb 27]. [110] P.J. Talmud, A.D. Hingorani, J.A. Cooper, M.G. Marmot, E.J. Brunner, M. Kumari, M.\",\n",
+ " \"type 2 diabetes risk, Diabetes 57 (11) (Nov 2008) 3129 3135. [103] Q. Lu, R.C. Elston, Using the optimal receiver operating characteristic curve to design a predictive genetic test, exempli ed with type 2 diabetes, Am. J. Hum. Genet. 82 (3) (Mar 2008) 641 651. [104] V. Lyssenko, A. Jonsson, P. Almgren, N. Pulizzi, B. Isomaa, T. Tuomi, G. Berglund, D. Altshuler, P. Nilsson, L. Groop, Clinical risk factors, DNA variants, and the development of type 2 diabetes, N. Engl. J. Med. 359 (21) (Nov 20 2008)\",\n",
+ " \"insulin resistance, hypertension, and dyslipidemia (Obesity Education Initiative Expert Panel, 1998 ). Insulin resist-ance increases with age, and the incidence of diabetes rises sharply in the elderly (American Diabetes Association, 2010a ). In a few patients, genetic mutations appear to be associ- ated with T2D (Roche et al. , 2005 ; American Diabetes Association, 2010a ). For example, recent work using the DPP data has led to the identi cation of 27 single nucle-\",\n",
+ " \"19. Permutt MA, Wasson J, Cox N: Genetic epidemiology of diabe- tes. J Clin Invest 2005, 115:1431-1439. 20. Barroso I: Genetics of Type 2 diabetes. Diabet Med 2005, 22:517-535. 21. Parikh H, Groop L: Candidate genes for type 2 diabetes. Rev Endocr Metab Disord 2004, 5:151-176. 22. Lohmueller KE, Pearce CL, Pike M, Lander ES, Hirschhorn JN: Meta- analysis of genetic association studies supports a contribu- tion of common variants to su sceptibility to common dis- ease. Nat Genet 2003, 33:177-182.\",\n",
+ " \"insulin-dependent diabetes and schizophrenia, twin studies have demon-strated the existence of a significant genetic component (Kyvik et al., 1995;Plomin et al., 1994). Genetic factors also influence cardiovascular diseaseswhich occur in early or midlife, while for cardiovascular diseases occur-ring late in life there is little evidence of a genetic effect (Marenberg et al.,1994). Dementia has a very strong genetic component, not only withregard to early-onset monogenic types but also to late-onset\",\n",
+ " \"Three categories of increased risk of developing diabetes are currently recognized by the ADA: an FPG between 5.6 and 6.9 mmol/L (100 and 125 mg/dL), de ned as having impaired fasting glucose (IFG); a 2 - h OGTT between 7.8 and 11 mmol/L (140 and 199 mg/dL), de ned as having impaired glucose tolerance (IGT); an A1C between 5.7 and 6.4% with values between 6.0 and 6.4 considered very high risk (American Diabetes Association, 2010a ). It is estimated that approximately one - fourth of indi-\",\n",
+ " \"20 90 D20S451 0.006 10.7 5.4 (34) 8.42 5.4 (61) 0.30 (long duration) Interaction with age at diagnosis of diabetes 19 1 D1S1665 0.004 37.4 8.1 (66) 41.2 8.3 (81) 0.23 (early onset) 2 159 D2S1399/D2S1353 0.023 40.8 8.2 (53) 38.8 8.5 (94) 0.16 (late onset) 3 135 D3S2460 0.036 37.7 8.6 (66) 41.0 8.0 (81) 0.16 (early onset) 4 146 D4S1625 0.005 37.9 7.4 (52) 40.4 8.9 (95) 0.23 (early onset) 65 5 D6S2427 0.024 38.0 7.8 (70) 40.9 8.8 (77) 0.18 (early onset)\",\n",
+ " \"prevention, even though there are great concerns about how such information will be perceived and utilized [68]. Recent studies in diabetes and cardiovascular disease suggest that the addition of previously associated genetic risk loci in clinical risk models of those diseases increases both discriminative and predictive accuracy, albeit only marginally. Typically, the strongest predictors of disease onset are known clinical risk factors such as body mass index, age, or\",\n",
+ " \"and sex, weight for height greater than the 85th percentile, or weight greater than 120% of ideal for height) who also have any two of the following risk factors: family history of T2D ( rst - or second - degree relative), maternal history of diabetes or GDM during child s gestation, member of high - risk race/ethnicity group (Native American, African -(40 in) in men is considered an indicator of increased\"\n",
+ " ],\n",
+ " \"answer\": \"The factors that help predict early onset of diabetes include: 1. **Genetic Factors**: Specific genetic mutations and polymorphisms have been associated with an increased risk of developing type 2 diabetes (T2D) [1], [2], [4], [5], [6], [8]. 2. **Clinical Risk Factors**: Known clinical risk factors such as body mass index (BMI), age, and sex are strong predictors of disease onset [9]. 3. **Family History**: A family history of T2D, particularly in first- or second-degree relatives, is a significant risk factor [10]. 4. **Maternal History**: A maternal history of diabetes or gestational diabetes mellitus (GDM) during the childs gestation increases the risk [10]. 5. **Ethnicity**: Being a member of a high-risk race/ethnicity group, such as Native American or African American, is associated with a higher risk [10]. 6. **Impaired Glucose Levels**: Impaired fasting glucose (IFG), impaired glucose tolerance (IGT), and elevated A1C levels are indicators of increased risk [7]. 7. **Insulin Resistance and Related Conditions**: Conditions such as insulin resistance, hypertension, and dyslipidemia are linked to a higher risk of developing diabetes [4]. These factors collectively contribute to the prediction of early onset diabetes.\",\n",
+ " \"question\": \"List any factors that help predict early onset of diabetes.\"\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(json.dumps(ragas_output, indent=2))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/gnqa/paper2_eval/src/parse_r2r_result.py b/gnqa/paper2_eval/src/parse_r2r_result.py
new file mode 100644
index 0000000..b30f2e7
--- /dev/null
+++ b/gnqa/paper2_eval/src/parse_r2r_result.py
@@ -0,0 +1,44 @@
+import json
+import sys
+
+read_file = '/data/code/gn-ai/gnqa/paper2_eval/data/rag_out_1.json'
+
+def iterate_json(obj, thedict):
+ if isinstance(obj, dict):
+ for key, val in obj.items():
+ if (key == "text"):
+ thedict["contexts"].append(val.replace("\n", " ").strip())
+ print("Key -> {0}\tValue -> {1}".format(key,val))
+ elif (key == "metadata"):
+ thedict["answer"] = val#.replace("\n", " ").strip()
+ print("Key -> {0}\tValue -> {1}".format(key,val))
+ elif (key == "id"):
+ print("Key -> {0}\tValue -> {1}".format(key,val))
+ elif (key == "associatedQuery"):
+ thedict["question"] = val.replace("\n", " ").strip()
+ print("Key -> {0}\tValue -> {1}".format(key,val))
+ elif (key == "title"):
+ print("Key -> {0}\tValue -> {1}".format(key,val))
+ elif (key == "document_id"):
+ print("Key -> {0}\tValue -> {1}".format(key,val))
+ else:
+ if (len(obj.items()) == 1 ):
+ print(key, " --> ", val)
+ iterate_json(val, thedict)
+ elif isinstance(obj, list):
+ for item in obj:
+ iterate_json(item, thedict)
+
+# this should be a json file with a list of input files and an output file
+with open(read_file, "r") as r_file:
+ result_file = json.load(r_file)
+
+ragas_output = {
+ "contexts": [],
+ "titles": [],
+ "answer": "",
+ "question": ""}
+vector_search_results = result_file["vector_search_results"]
+iterate_json(vector_search_results, ragas_output)
+
+print(json.dumps(ragas_output, indent=2)) \ No newline at end of file
diff --git a/gnqa/paper2_eval/src/run_questions.py b/gnqa/paper2_eval/src/run_questions.py
new file mode 100644
index 0000000..5e0b28b
--- /dev/null
+++ b/gnqa/paper2_eval/src/run_questions.py
@@ -0,0 +1,45 @@
+import json
+import sys
+import os
+
+from r2r import R2R, Document, GenerationConfig, R2RClient
+
+
+def writeDatasetFile(responses, outp_file):
+ print(outp_file)
+ output = json.dumps(responses, indent=2)
+ if os.path.exists(outp_file):
+ with open(outp_file, "a") as the_data:
+ the_data.write('' + output)
+ else:
+ with open(outp_file, "a") as the_data:
+ the_data.write(output)
+
+client = R2RClient("http://localhost:8000")
+
+health_resp = client.health()
+
+print("The R2R client's health status is {0}".format(health_resp))
+
+questions = [
+ "List as many studies as you can that include rapamycin.",
+ "Why is it so difficult to map gene loci that control aging in humans?"
+]
+rag_response = {}
+
+ndx = 1
+for question in questions:
+ rag_response[str(ndx)] = client.rag(question)
+ ndx = ndx + 1
+
+#print(json.dumps(rag_response, indent=2))
+
+
+try:
+ read_file = str(sys.argv[1])
+ out_file = str(sys.argv[2])
+except:
+ exit('Example use "python run_questions.py ../data/questions/human/de/aging.json ../data/responses/human/de/aging_resp.json"')
+
+
+writeDatasetFile(rag_response, out_file) \ No newline at end of file