aboutsummaryrefslogtreecommitdiff
path: root/gnqa/paper2_eval/src/parse_r2r_result.ipynb
blob: 8ceac7228c68a66c9328a221032ebb74e12498f4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse RAGAS JSON output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import sys\n",
    "\n",
    "# Trace switch read by iterate_json: use 0 or 1; 1 prints every matched key/value pair, 0 does not.\n",
    "verbose = 0\n",
    "# Input JSON file to parse.\n",
    "# NOTE(review): absolute path tied to one machine's layout - consider deriving it from a configurable data directory.\n",
    "read_file = '/data/code/gn-ai/gnqa/paper2_eval/data/rag_out_1.json'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def iterate_json(obj, thedict):\n",
    "    \"\"\"Recursively walk parsed JSON and collect RAGAS fields into ``thedict``.\n",
    "\n",
    "    Dict keys are matched by name at every nesting depth:\n",
    "      * 'text', 'id', 'title', 'document_id', 'extraction_id': newlines in the\n",
    "        value become spaces, the ends are stripped, and the result is appended\n",
    "        to the matching list ('text' -> 'contexts', 'title' -> 'titles').\n",
    "      * 'content': the cleaned value replaces ``thedict['answer']``.\n",
    "      * 'associatedQuery': the value replaces ``thedict['question']`` unchanged.\n",
    "    A non-string value under a cleaned key is skipped instead of raising\n",
    "    AttributeError; every value is still recursed into.\n",
    "\n",
    "    Reads the notebook-level ``verbose`` flag. When it is truthy each matched\n",
    "    key/value pair is printed; when it is falsy nothing is printed for a match\n",
    "    (the earlier version printed one empty line per match).\n",
    "\n",
    "    Args:\n",
    "        obj: Parsed JSON value. Anything that is not a dict or list is ignored.\n",
    "        thedict: Accumulator with list entries 'contexts', 'id', 'titles',\n",
    "            'document_id' and 'extraction_id'; 'answer' and 'question' are\n",
    "            assigned. Mutated in place.\n",
    "\n",
    "    Returns:\n",
    "        None.\n",
    "    \"\"\"\n",
    "    # Source key -> accumulator list that receives the cleaned value.\n",
    "    list_targets = {\n",
    "        \"text\": \"contexts\",\n",
    "        \"id\": \"id\",\n",
    "        \"title\": \"titles\",\n",
    "        \"document_id\": \"document_id\",\n",
    "        \"extraction_id\": \"extraction_id\",\n",
    "    }\n",
    "    if isinstance(obj, dict):\n",
    "        for key, val in obj.items():\n",
    "            if key == \"associatedQuery\":\n",
    "                # The question is stored verbatim, without whitespace clean-up.\n",
    "                thedict[\"question\"] = val\n",
    "                matched = True\n",
    "            elif key == \"content\" or key in list_targets:\n",
    "                matched = True\n",
    "                if isinstance(val, str):\n",
    "                    cleaned = val.replace(\"\\n\", \" \").strip()\n",
    "                    if key == \"content\":\n",
    "                        thedict[\"answer\"] = cleaned\n",
    "                    else:\n",
    "                        thedict[list_targets[key]].append(cleaned)\n",
    "            else:\n",
    "                matched = False\n",
    "                # Unrecognised key in a single-entry dict: echo it, as before.\n",
    "                if len(obj) == 1:\n",
    "                    print(key, \" --> \", val)\n",
    "            # Trace matched keys only on request; no blank line when quiet.\n",
    "            if matched and verbose:\n",
    "                print(\"Key -> {0}\\tValue -> {1}\".format(key, val))\n",
    "            iterate_json(val, thedict)\n",
    "    elif isinstance(obj, list):\n",
    "        for item in obj:\n",
    "            iterate_json(item, thedict)\n",
    "\n",
    "# TODO: this should be a json file with a list of input files and an output file\n",
    "# Read the result file as UTF-8 explicitly so the platform default encoding\n",
    "# cannot garble non-ASCII text in it.\n",
    "with open(read_file, \"r\", encoding=\"utf-8\") as r_file:\n",
    "    result_file = json.load(r_file)\n",
    "\n",
    "# Accumulator filled in place by iterate_json: list fields collect one entry\n",
    "# per matching key, 'answer' and 'question' hold the last value seen.\n",
    "ragas_output = {\n",
    "    \"titles\": [],\n",
    "    \"extraction_id\": [],\n",
    "    \"document_id\": [],\n",
    "    \"id\": [],\n",
    "    \"contexts\": [],\n",
    "    \"answer\": \"\",\n",
    "    \"question\": \"\"}\n",
    "# Both sections are walked into the same accumulator.\n",
    "# NOTE(review): presumably the search results supply the contexts/question and\n",
    "# the choices supply the answer - confirm against the result-file schema.\n",
    "vector_search_results = result_file[\"vector_search_results\"]\n",
    "choices = result_file[\"choices\"]\n",
    "iterate_json(vector_search_results, ragas_output)\n",
    "iterate_json(choices, ragas_output)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"titles\": [\n",
      "    \"2011 - Annotating individual human genomes.pdf\",\n",
      "    \"2011 - Annotating individual human genomes.pdf\",\n",
      "    \"2011 - Annotating individual human genomes.pdf\",\n",
      "    \"2012 - Systems Biology Approaches to Nutrition.pdf\",\n",
      "    \"2006 - \\u03b22-adrenergic receptor and UCP3 variants modulate the relationship between age and type 2 diabetes mellitus.pdf\",\n",
      "    \"2001 - Demography in the age of genomics.pdf\",\n",
      "    \"2012 - Systems Biology Approaches to Nutrition.pdf\",\n",
      "    \"2004 - A genome scan for diabetic nephropathy in African Americans.pdf\",\n",
      "    \"2011 - Annotating individual human genomes.pdf\",\n",
      "    \"2012 - Systems Biology Approaches to Nutrition.pdf\"\n",
      "  ],\n",
      "  \"extraction_id\": [\n",
      "    \"80d78615-8424-5478-a01b-73e220bc0345\",\n",
      "    \"80d78615-8424-5478-a01b-73e220bc0345\",\n",
      "    \"80d78615-8424-5478-a01b-73e220bc0345\",\n",
      "    \"eb3de845-98db-505c-bb7f-c0f3259875fc\",\n",
      "    \"acf69ed8-c7b0-5d9f-8005-de020c9cf699\",\n",
      "    \"5f24a851-1de6-5b6e-8230-2da08806b01a\",\n",
      "    \"eb3de845-98db-505c-bb7f-c0f3259875fc\",\n",
      "    \"01ec7832-8a80-5f5e-aa26-3648f572c4a1\",\n",
      "    \"c17c74fa-12a3-5072-bb48-c179055db14d\",\n",
      "    \"eb3de845-98db-505c-bb7f-c0f3259875fc\"\n",
      "  ],\n",
      "  \"document_id\": [\n",
      "    \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n",
      "    \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n",
      "    \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n",
      "    \"6955478b-950d-5d29-b24c-3a5ca656f3ae\",\n",
      "    \"0ea34c04-5d09-5a32-89a7-c3add179927a\",\n",
      "    \"0f07fa43-feb6-5656-b7e7-b8faa86f5623\",\n",
      "    \"6955478b-950d-5d29-b24c-3a5ca656f3ae\",\n",
      "    \"5798fb6b-b3e6-57c4-9823-5428853dbfa1\",\n",
      "    \"f7b5d738-3f0b-5074-9c21-f6b443b4e07f\",\n",
      "    \"6955478b-950d-5d29-b24c-3a5ca656f3ae\"\n",
      "  ],\n",
      "  \"id\": [\n",
      "    \"7656b48b-d191-516e-9753-d34efedd4812\",\n",
      "    \"d1af5c82-d226-5980-b5d9-90d7558d1880\",\n",
      "    \"bb2a67ec-135b-5d55-b33d-74b1dc085685\",\n",
      "    \"11fc663d2-2833-51e7-ae6a-55b007a6e27c\",\n",
      "    \"bb55a705-7399-550e-8285-07c33654b909\",\n",
      "    \"9bff43c0-fd12-572e-9996-24957edd17d2\",\n",
      "    \"2df84ccc-0d32-582e-bda6-9cd46bee5378\",\n",
      "    \"ff30f187-d5c3-5d01-8026-0588a77e9f44\",\n",
      "    \"57dc1ee5-4252-52c3-92cb-e2ac36cdc4d6\",\n",
      "    \"ff801099-e737-57b1-91af-a4cea20adb87\"\n",
      "  ],\n",
      "  \"contexts\": [\n",
      "    \"gene interaction and high predictive value, PLoS One 3 (5) (2008) e2031,doi:10.1371/journal.pone.0002031 . [107] M. van Hoek, A. Dehghan, J.C. Witteman, C.M. van Duijn, A.G. Uitterlinden, B.A. Oostra, A. Hofman, E.J. Sijbrands, A.C. Janssens, Predicting type 2 diabetes based on polymorphisms from genome-wide association studies: a population-based study, Diabetes 57 (11) (Nov 2008) 3122 3128. [108] Q. Lu, Y. Song, X. Wang, S. Won, Y. Cui, R.C. Elston, The effect of multiple genetic\",\n",
      "    \"variants in predicting the risk of type 2 diabetes, BMC Proc 3 (Suppl 7) (Dec 15 2009) S49. [109] K. Miyake, W. Yang, K. Hara, K. Yasuda, Y. Horikawa, H. Osawa, H. Furuta, et al., Construction of a prediction model for type 2 diabetes mellitus in the Japanese population based on 11 genes with strong evidence of the association, J. Hum. Genet. 54 (4) (Apr 2009) 236 241 [Epub 2009 Feb 27]. [110] P.J. Talmud, A.D. Hingorani, J.A. Cooper, M.G. Marmot, E.J. Brunner, M. Kumari, M.\",\n",
      "    \"type 2 diabetes risk, Diabetes 57 (11) (Nov 2008) 3129 3135. [103] Q. Lu, R.C. Elston, Using the optimal receiver operating characteristic curve to design a predictive genetic test, exempli ed with type 2 diabetes, Am. J. Hum. Genet. 82 (3) (Mar 2008) 641 651. [104] V. Lyssenko, A. Jonsson, P. Almgren, N. Pulizzi, B. Isomaa, T. Tuomi, G. Berglund, D. Altshuler, P. Nilsson, L. Groop, Clinical risk factors, DNA variants, and the development of type 2 diabetes, N. Engl. J. Med. 359 (21) (Nov 20 2008)\",\n",
      "    \"insulin resistance, hypertension, and dyslipidemia (Obesity Education Initiative Expert Panel,  1998 ). Insulin resist-ance increases with age, and the incidence of diabetes rises sharply in the elderly (American Diabetes Association,  2010a ).   In a few patients, genetic mutations appear to be associ- ated with T2D (Roche  et al.  ,  2005 ; American Diabetes  Association,  2010a ). For example, recent work using the DPP data has led to the identi  cation of 27 single nucle-\",\n",
      "    \"19. Permutt MA, Wasson J, Cox N: Genetic epidemiology of diabe- tes.  J Clin Invest  2005, 115:1431-1439. 20. Barroso I: Genetics of Type 2 diabetes.   Diabet Med  2005, 22:517-535. 21. Parikh H, Groop L: Candidate genes for type 2 diabetes.   Rev Endocr Metab Disord  2004, 5:151-176. 22. Lohmueller KE, Pearce CL, Pike M, Lander ES, Hirschhorn JN: Meta- analysis of genetic association studies supports a contribu- tion of common variants to su sceptibility to common dis- ease.   Nat Genet  2003, 33:177-182.\",\n",
      "    \"insulin-dependent diabetes and schizophrenia, twin studies have demon-strated the existence of a significant genetic component (Kyvik et al., 1995;Plomin et al., 1994). Genetic factors also influence cardiovascular diseaseswhich occur in early or midlife, while for cardiovascular diseases occur-ring late in life there is little evidence of a genetic effect (Marenberg et al.,1994). Dementia has a very strong genetic component, not only withregard to early-onset monogenic types but also to late-onset\",\n",
      "    \"Three categories of increased risk of developing diabetes  are currently recognized by the ADA: an FPG between 5.6 and 6.9   mmol/L (100 and 125   mg/dL), de  ned as having  impaired fasting glucose (IFG); a 2 - h OGTT between 7.8 and 11   mmol/L (140 and 199   mg/dL), de  ned as having  impaired glucose tolerance (IGT); an A1C between 5.7 and 6.4% with values between 6.0 and 6.4 considered very high risk (American Diabetes Association,  2010a ).   It is estimated that approximately one - fourth of indi-\",\n",
      "    \"20 90 D20S451 0.006 10.7 5.4 (34) 8.42 5.4 (61) 0.30 (long duration) Interaction with age at diagnosis of diabetes 19 1 D1S1665 0.004 37.4 8.1 (66) 41.2 8.3 (81) 0.23 (early onset) 2 159 D2S1399/D2S1353 0.023 40.8 8.2 (53) 38.8 8.5 (94) 0.16 (late onset) 3 135 D3S2460 0.036 37.7 8.6 (66) 41.0 8.0 (81) 0.16 (early onset) 4 146 D4S1625 0.005 37.9 7.4 (52) 40.4 8.9 (95) 0.23 (early onset) 65 5 D6S2427 0.024 38.0 7.8 (70) 40.9 8.8 (77) 0.18 (early onset)\",\n",
      "    \"prevention, even though there are great concerns about how such information will be perceived and utilized [68]. Recent studies in diabetes and cardiovascular disease suggest that the addition of previously associated genetic risk loci in clinical risk models of those diseases increases both discriminative and predictive accuracy, albeit only marginally. Typically, the strongest predictors of disease onset are known clinical risk factors such as body mass index, age, or\",\n",
      "    \"and sex, weight for height greater than the 85th percentile, or weight greater than 120% of ideal for height) who also have any two of the following risk factors: family history of T2D (  rst -  or second - degree relative), maternal history of diabetes or GDM during child  s gestation, member of high - risk race/ethnicity group (Native American, African -(40   in) in men is considered an indicator of increased\"\n",
      "  ],\n",
      "  \"answer\": \"The factors that help predict early onset of diabetes include:  1. **Genetic Factors**: Specific genetic mutations and polymorphisms have been associated with an increased risk of developing type 2 diabetes (T2D) [1], [2], [4], [5], [6], [8]. 2. **Clinical Risk Factors**: Known clinical risk factors such as body mass index (BMI), age, and sex are strong predictors of disease onset [9]. 3. **Family History**: A family history of T2D, particularly in first- or second-degree relatives, is a significant risk factor [10]. 4. **Maternal History**: A maternal history of diabetes or gestational diabetes mellitus (GDM) during the childs gestation increases the risk [10]. 5. **Ethnicity**: Being a member of a high-risk race/ethnicity group, such as Native American or African American, is associated with a higher risk [10]. 6. **Impaired Glucose Levels**: Impaired fasting glucose (IFG), impaired glucose tolerance (IGT), and elevated A1C levels are indicators of increased risk [7]. 7. **Insulin Resistance and Related Conditions**: Conditions such as insulin resistance, hypertension, and dyslipidemia are linked to a higher risk of developing diabetes [4].  These factors collectively contribute to the prediction of early onset diabetes.\",\n",
      "  \"question\": \"List any factors that help predict early onset of diabetes.\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Show the collected record as indented JSON (json.dumps escapes non-ASCII characters by default).\n",
    "print(json.dumps(ragas_output, indent=2))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}