aboutsummaryrefslogtreecommitdiff
path: root/gnqa/paper2_eval/data/scores
diff options
context:
space:
mode:
authorShelbySolomonDarnell2024-10-17 12:24:26 +0300
committerShelbySolomonDarnell2024-10-17 12:24:26 +0300
commit00cba4b9a1e88891f1f96a1199320092c1962343 (patch)
tree270fd06daa18b2fc5687ee72d912cad771354bb0 /gnqa/paper2_eval/data/scores
parente0b2b0e55049b89805f73f291df1e28fa05487fe (diff)
downloadgn-ai-master.tar.gz
Docker image built to run code, all evals run using R2RHEADmaster
Diffstat (limited to 'gnqa/paper2_eval/data/scores')
-rw-r--r--gnqa/paper2_eval/data/scores/de_aging_2.json19
-rw-r--r--gnqa/paper2_eval/data/scores/gpt4o_cs_aging_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/gpt4o_cs_diabetes_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/gpt4o_cs_gn_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/gpt4o_de_aging_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/gpt4o_de_diabetes_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/gpt4o_de_gn_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/human_cs_aging_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/human_cs_diabetes_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/human_cs_gn_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/human_de_aging_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/human_de_diabetes_score.json19
-rw-r--r--gnqa/paper2_eval/data/scores/human_de_gn_score.json19
13 files changed, 247 insertions, 0 deletions
diff --git a/gnqa/paper2_eval/data/scores/de_aging_2.json b/gnqa/paper2_eval/data/scores/de_aging_2.json
new file mode 100644
index 0000000..5c3b9b7
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/de_aging_2.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.8571428571428571,
+ "answer_relevancy": 0.9949956999959797,
+ "context_relevancy": 0.016129032258064516,
+ "context_utilization": 0.9888888888779013
+},
+{
+ "faithfulness": 1.0,
+ "answer_relevancy": 0.9927347208847189,
+ "context_relevancy": 0.016129032258064516,
+ "context_utilization": 0.9888888888779013
+},
+{
+ "faithfulness": 1.0,
+ "answer_relevancy": 0.9904737417734579,
+ "context_relevancy": 0.016129032258064516,
+ "context_utilization": 0.8227678571325725
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/gpt4o_cs_aging_score.json b/gnqa/paper2_eval/data/scores/gpt4o_cs_aging_score.json
new file mode 100644
index 0000000..054bd65
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/gpt4o_cs_aging_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.8435247647089751,
+ "answer_relevancy": 0.9881089261370415,
+ "context_relevancy": 0.0626087109471981,
+ "context_utilization": 0.9999999999899998
+},
+{
+ "faithfulness": 0.8861904761904761,
+ "answer_relevancy": 0.9896018313744044,
+ "context_relevancy": 0.06451347285195999,
+ "context_utilization": 0.9999999999899998
+},
+{
+ "faithfulness": 0.8207883344725448,
+ "answer_relevancy": 0.9881842770232447,
+ "context_relevancy": 0.06265981914307604,
+ "context_utilization": 0.9999999999899998
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/gpt4o_cs_diabetes_score.json b/gnqa/paper2_eval/data/scores/gpt4o_cs_diabetes_score.json
new file mode 100644
index 0000000..0226f01
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/gpt4o_cs_diabetes_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.9207323232323231,
+ "answer_relevancy": 0.9753090544756589,
+ "context_relevancy": 0.058441912911640505,
+ "context_utilization": 0.9860918997127556
+},
+{
+ "faithfulness": 0.9451515151515151,
+ "answer_relevancy": 0.9763192509534061,
+ "context_relevancy": 0.055226068472391796,
+ "context_utilization": 0.983296406515531
+},
+{
+ "faithfulness": 0.9375396825396827,
+ "answer_relevancy": 0.9753865446862534,
+ "context_relevancy": 0.05655290829923163,
+ "context_utilization": 0.9761685090602639
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/gpt4o_cs_gn_score.json b/gnqa/paper2_eval/data/scores/gpt4o_cs_gn_score.json
new file mode 100644
index 0000000..0584c79
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/gpt4o_cs_gn_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.8976182844932845,
+ "answer_relevancy": 0.8718703112058132,
+ "context_relevancy": 0.061916430658313815,
+ "context_utilization": 0.8942353237499372
+},
+{
+ "faithfulness": 0.8984067321567322,
+ "answer_relevancy": 0.8693138098899877,
+ "context_relevancy": 0.06339331584209555,
+ "context_utilization": 0.8957234189880159
+},
+{
+ "faithfulness": 0.9011500305250306,
+ "answer_relevancy": 0.8734160966030811,
+ "context_relevancy": 0.06326778200966515,
+ "context_utilization": 0.8973390652465808
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/gpt4o_de_aging_score.json b/gnqa/paper2_eval/data/scores/gpt4o_de_aging_score.json
new file mode 100644
index 0000000..752716c
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/gpt4o_de_aging_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.8916292041292042,
+ "answer_relevancy": 0.9622909267407932,
+ "context_relevancy": 0.056009119488898904,
+ "context_utilization": 0.9949374448752393
+},
+{
+ "faithfulness": 0.9239045676545677,
+ "answer_relevancy": 0.9602574645814024,
+ "context_relevancy": 0.056009119488898904,
+ "context_utilization": 0.9999999999899443
+},
+{
+ "faithfulness": 0.9022054334554334,
+ "answer_relevancy": 0.9621900912593574,
+ "context_relevancy": 0.05556803279781221,
+ "context_utilization": 0.9964131393197346
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/gpt4o_de_diabetes_score.json b/gnqa/paper2_eval/data/scores/gpt4o_de_diabetes_score.json
new file mode 100644
index 0000000..18c3e33
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/gpt4o_de_diabetes_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.8694443056943058,
+ "answer_relevancy": 0.9143466126793479,
+ "context_relevancy": 0.03506694580871902,
+ "context_utilization": 0.9838784170972874
+},
+{
+ "faithfulness": 0.8246933621933621,
+ "answer_relevancy": 0.915552384671478,
+ "context_relevancy": 0.0329103695083071,
+ "context_utilization": 0.9903549382614113
+},
+{
+ "faithfulness": 0.8755350899100899,
+ "answer_relevancy": 0.9637196237550363,
+ "context_relevancy": 0.0343820143018697,
+ "context_utilization": 0.9894689704483846
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/gpt4o_de_gn_score.json b/gnqa/paper2_eval/data/scores/gpt4o_de_gn_score.json
new file mode 100644
index 0000000..2cc53f9
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/gpt4o_de_gn_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.8297535103785105,
+ "answer_relevancy": 0.872067854405554,
+ "context_relevancy": 0.03650042049847416,
+ "context_utilization": 0.9308504975963435
+},
+{
+ "faithfulness": 0.7872641941391942,
+ "answer_relevancy": 0.8733484807158058,
+ "context_relevancy": 0.03650042049847416,
+ "context_utilization": 0.9272795414361721
+},
+{
+ "faithfulness": 0.7596802503052503,
+ "answer_relevancy": 0.8726434115697865,
+ "context_relevancy": 0.03650042049847416,
+ "context_utilization": 0.926490378548729
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/human_cs_aging_score.json b/gnqa/paper2_eval/data/scores/human_cs_aging_score.json
new file mode 100644
index 0000000..dfd7853
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/human_cs_aging_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.8425019425019424,
+ "answer_relevancy": 0.952484152627895,
+ "context_relevancy": 0.04961625911070337,
+ "context_utilization": 0.9981956315188991
+},
+{
+ "faithfulness": 0.871989121989122,
+ "answer_relevancy": 0.9531096223056006,
+ "context_relevancy": 0.05031277271044277,
+ "context_utilization": 0.9953228869794345
+},
+{
+ "faithfulness": 0.8605672105672105,
+ "answer_relevancy": 0.9564885313193343,
+ "context_relevancy": 0.0482337706314407,
+ "context_utilization": 0.9981956315188991
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/human_cs_diabetes_score.json b/gnqa/paper2_eval/data/scores/human_cs_diabetes_score.json
new file mode 100644
index 0000000..8974d29
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/human_cs_diabetes_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.719520757020757,
+ "answer_relevancy": 0.9516275767101735,
+ "context_relevancy": 0.0490614785763392,
+ "context_utilization": 0.9841827876884541
+},
+{
+ "faithfulness": 0.7050892857142858,
+ "answer_relevancy": 0.957680303141668,
+ "context_relevancy": 0.04485272436758499,
+ "context_utilization": 0.9900766093374835
+},
+{
+ "faithfulness": 0.7229868742368742,
+ "answer_relevancy": 0.9564961457687489,
+ "context_relevancy": 0.04876279279378244,
+ "context_utilization": 0.9841827876884541
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/human_cs_gn_score.json b/gnqa/paper2_eval/data/scores/human_cs_gn_score.json
new file mode 100644
index 0000000..2f9f47a
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/human_cs_gn_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.8040619953533557,
+ "answer_relevancy": 0.933496097114236,
+ "context_relevancy": 0.04967998901244118,
+ "context_utilization": 0.9402004942497864
+},
+{
+ "faithfulness": 0.8140579688144026,
+ "answer_relevancy": 0.9362523112695514,
+ "context_relevancy": 0.04934854961850179,
+ "context_utilization": 0.9462251639555944
+},
+{
+ "faithfulness": 0.8346159741927046,
+ "answer_relevancy": 0.9378502646867282,
+ "context_relevancy": 0.050870465202917374,
+ "context_utilization": 0.9566437990021893
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/human_de_aging_score.json b/gnqa/paper2_eval/data/scores/human_de_aging_score.json
new file mode 100644
index 0000000..8a9abf3
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/human_de_aging_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.6266788766788767,
+ "answer_relevancy": 0.9706128992746693,
+ "context_relevancy": 0.045458016797918395,
+ "context_utilization": 0.9861919459039513
+},
+{
+ "faithfulness": 0.7455128205128205,
+ "answer_relevancy": 0.9737920787712793,
+ "context_relevancy": 0.040249683464585066,
+ "context_utilization": 0.9861919459039513
+},
+{
+ "faithfulness": 0.6762223283962414,
+ "answer_relevancy": 0.9763265441772212,
+ "context_relevancy": 0.045458016797918395,
+ "context_utilization": 0.9496353247987496
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/human_de_diabetes_score.json b/gnqa/paper2_eval/data/scores/human_de_diabetes_score.json
new file mode 100644
index 0000000..8f86e84
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/human_de_diabetes_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.8657178469678469,
+ "answer_relevancy": 0.9698773165539708,
+ "context_relevancy": 0.03165020943114643,
+ "context_utilization": 0.980117182190157
+},
+{
+ "faithfulness": 0.8578238001314925,
+ "answer_relevancy": 0.9720084862982596,
+ "context_relevancy": 0.03536219511987631,
+ "context_utilization": 0.9831616300264557
+},
+{
+ "faithfulness": 0.7997557997557998,
+ "answer_relevancy": 0.9715015597017967,
+ "context_relevancy": 0.033439118196799386,
+ "context_utilization": 0.9790216727612565
+} \ No newline at end of file
diff --git a/gnqa/paper2_eval/data/scores/human_de_gn_score.json b/gnqa/paper2_eval/data/scores/human_de_gn_score.json
new file mode 100644
index 0000000..0155ee1
--- /dev/null
+++ b/gnqa/paper2_eval/data/scores/human_de_gn_score.json
@@ -0,0 +1,19 @@
+,
+{
+ "faithfulness": 0.9060218485218485,
+ "answer_relevancy": 0.9180706452518813,
+ "context_relevancy": 0.04851549372256422,
+ "context_utilization": 0.9846595919399526
+},
+{
+ "faithfulness": 0.9164201118746573,
+ "answer_relevancy": 0.9191154003593898,
+ "context_relevancy": 0.04842257317509822,
+ "context_utilization": 0.978182619837846
+},
+{
+ "faithfulness": 0.9114957337449501,
+ "answer_relevancy": 0.9201106080009104,
+ "context_relevancy": 0.04557179296455012,
+ "context_utilization": 0.9875496031612268
+} \ No newline at end of file