From 835e229909e9bdb6e084c5112672065886517adb Mon Sep 17 00:00:00 2001 From: Nyeusi D. Shebes Date: Thu, 27 Feb 2025 22:04:47 -0600 Subject: refactoring codebase --- gnqa/data/study1/results/eval2_general1.json | 7 ++++ gnqa/data/study1/results/eval2_general2.json | 13 ++++++++ gnqa/data/study1/results/eval_aging1.json | 19 +++++++++++ gnqa/data/study1/results/eval_aging2.json | 19 +++++++++++ gnqa/data/study1/results/eval_experts_aging1.json | 18 ++++++++++ gnqa/data/study1/results/eval_experts_aging2.json | 18 ++++++++++ .../data/study1/results/eval_experts_general1.json | 19 +++++++++++ .../data/study1/results/eval_experts_general2.json | 19 +++++++++++ gnqa/data/study1/results/eval_experts_suga1.json | 18 ++++++++++ gnqa/data/study1/results/eval_general1.json | 18 ++++++++++ gnqa/data/study1/results/eval_general2.json | 18 ++++++++++ gnqa/data/study1/results/eval_suga1.json | 19 +++++++++++ gnqa/data/study1/results/eval_suga2.json | 19 +++++++++++ gnqa/data/study1/results/eval_sugaA.json | 7 ++++ gnqa/data/study1/results/gemma_eval_general1.json | 7 ++++ gnqa/data/study1/results/gemma_eval_general2.json | 7 ++++ .../results/gpt4o/gpt4o_eval_cs_aging_1.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_cs_aging_2.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_cs_aging_3.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_cs_aging_4.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_cs_diabetes_1.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_cs_diabetes_2.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_cs_diabetes_3.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_cs_diabetes_4.json | 19 +++++++++++ .../study1/results/gpt4o/gpt4o_eval_cs_gn_1.json | 19 +++++++++++ .../study1/results/gpt4o/gpt4o_eval_cs_gn_3.json | 19 +++++++++++ .../study1/results/gpt4o/gpt4o_eval_cs_gn_4.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_de_aging_1.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_de_aging_2.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_de_aging_3.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_de_aging_4.json | 19 +++++++++++ .../results/gpt4o/gpt4o_eval_de_diabetes_1.json | 20 +++++++++++ .../results/gpt4o/gpt4o_eval_de_diabetes_2.json | 20 +++++++++++ .../results/gpt4o/gpt4o_eval_de_diabetes_3.json | 20 +++++++++++ .../results/gpt4o/gpt4o_eval_de_diabetes_4.json | 20 +++++++++++ .../study1/results/gpt4o/gpt4o_eval_de_gn_3.json | 19 +++++++++++ .../study1/results/gpt4o/scores_cs_diabetes.json | 37 ++++++++++++++++++++ .../study1/results/human/scores_cs_aging_1.json | 19 +++++++++++ .../study1/results/human/scores_cs_aging_2.json | 19 +++++++++++ .../study1/results/human/scores_cs_aging_3.json | 19 +++++++++++ .../study1/results/human/scores_cs_diabetes_1.json | 19 +++++++++++ .../study1/results/human/scores_cs_diabetes_2.json | 19 +++++++++++ .../study1/results/human/scores_cs_diabetes_3.json | 19 +++++++++++ .../study1/results/human/scores_cs_diabetes_4.json | 39 ++++++++++++++++++++++ gnqa/data/study1/results/human/scores_cs_gn_1.json | 14 ++++++++ gnqa/data/study1/results/human/scores_cs_gn_2.json | 20 +++++++++++ gnqa/data/study1/results/human/scores_cs_gn_3.json | 25 ++++++++++++++ gnqa/data/study1/results/human/scores_cs_gn_4.json | 19 +++++++++++ gnqa/data/study1/results/human/scores_cs_gn_5.json | 19 +++++++++++ gnqa/data/study1/results/human/scores_cs_gn_6.json | 19 +++++++++++ gnqa/data/study1/results/human/scores_cs_gn_7.json | 18 ++++++++++ .../study1/results/human/scores_de_aging_1.json | 19 +++++++++++ .../study1/results/human/scores_de_aging_2.json | 19 +++++++++++ .../results/human/scores_de_diabetes_1.1.json | 19 +++++++++++ .../study1/results/human/scores_de_diabetes_1.json | 19 +++++++++++ .../study1/results/human/scores_de_diabetes_2.json | 19 +++++++++++ .../study1/results/human/scores_de_gn_1.1.json | 19 +++++++++++ gnqa/data/study1/results/human/scores_de_gn_1.json | 18 ++++++++++ gnqa/data/study1/results/human/scores_de_gn_2.json | 18 ++++++++++ gnqa/data/study1/results/human/scores_de_gn_3.json | 19 +++++++++++ gnqa/data/study1/results/human/scores_de_gn_4.json | 19 +++++++++++ gnqa/data/study1/results/human/scores_de_gn_5.json | 19 +++++++++++ gnqa/data/study1/results/human/scores_de_gn_6.json | 19 +++++++++++ gnqa/data/study1/results/llamaeval_general1.json | 13 ++++++++ gnqa/data/study1/results/results.json | 20 +++++++++++ gnqa/data/study1/results/results_aging.json | 19 +++++++++++ gnqa/data/study1/results/test.json | 19 +++++++++++ gnqa/data/study1/results/test2.json | 19 +++++++++++ 68 files changed, 1269 insertions(+) create mode 100644 gnqa/data/study1/results/eval2_general1.json create mode 100644 gnqa/data/study1/results/eval2_general2.json create mode 100644 gnqa/data/study1/results/eval_aging1.json create mode 100644 gnqa/data/study1/results/eval_aging2.json create mode 100644 gnqa/data/study1/results/eval_experts_aging1.json create mode 100644 gnqa/data/study1/results/eval_experts_aging2.json create mode 100644 gnqa/data/study1/results/eval_experts_general1.json create mode 100644 gnqa/data/study1/results/eval_experts_general2.json create mode 100644 gnqa/data/study1/results/eval_experts_suga1.json create mode 100644 gnqa/data/study1/results/eval_general1.json create mode 100644 gnqa/data/study1/results/eval_general2.json create mode 100644 gnqa/data/study1/results/eval_suga1.json create mode 100644 gnqa/data/study1/results/eval_suga2.json create mode 100644 gnqa/data/study1/results/eval_sugaA.json create mode 100644 gnqa/data/study1/results/gemma_eval_general1.json create mode 100644 gnqa/data/study1/results/gemma_eval_general2.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_1.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_2.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_3.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_4.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_1.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_2.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_3.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_4.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_1.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_3.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_4.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_1.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_2.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_3.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_4.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_1.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_2.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_3.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_4.json create mode 100644 gnqa/data/study1/results/gpt4o/gpt4o_eval_de_gn_3.json create mode 100644 gnqa/data/study1/results/gpt4o/scores_cs_diabetes.json create mode 100644 gnqa/data/study1/results/human/scores_cs_aging_1.json create mode 100644 gnqa/data/study1/results/human/scores_cs_aging_2.json create mode 100644 gnqa/data/study1/results/human/scores_cs_aging_3.json create mode 100644 gnqa/data/study1/results/human/scores_cs_diabetes_1.json create mode 100644 gnqa/data/study1/results/human/scores_cs_diabetes_2.json create mode 100644 gnqa/data/study1/results/human/scores_cs_diabetes_3.json create mode 100644 gnqa/data/study1/results/human/scores_cs_diabetes_4.json create mode 100644 gnqa/data/study1/results/human/scores_cs_gn_1.json create mode 100644 gnqa/data/study1/results/human/scores_cs_gn_2.json create mode 100644 gnqa/data/study1/results/human/scores_cs_gn_3.json create mode 100644 gnqa/data/study1/results/human/scores_cs_gn_4.json create mode 100644 gnqa/data/study1/results/human/scores_cs_gn_5.json create mode 100644 gnqa/data/study1/results/human/scores_cs_gn_6.json create mode 100644 gnqa/data/study1/results/human/scores_cs_gn_7.json create mode 100644 gnqa/data/study1/results/human/scores_de_aging_1.json create mode 100644 gnqa/data/study1/results/human/scores_de_aging_2.json create mode 100644 gnqa/data/study1/results/human/scores_de_diabetes_1.1.json create mode 100644 gnqa/data/study1/results/human/scores_de_diabetes_1.json create mode 100644 gnqa/data/study1/results/human/scores_de_diabetes_2.json create mode 100644 gnqa/data/study1/results/human/scores_de_gn_1.1.json create mode 100644 gnqa/data/study1/results/human/scores_de_gn_1.json create mode 100644 gnqa/data/study1/results/human/scores_de_gn_2.json create mode 100644 gnqa/data/study1/results/human/scores_de_gn_3.json create mode 100644 gnqa/data/study1/results/human/scores_de_gn_4.json create mode 100644 gnqa/data/study1/results/human/scores_de_gn_5.json create mode 100644 gnqa/data/study1/results/human/scores_de_gn_6.json create mode 100644 gnqa/data/study1/results/llamaeval_general1.json create mode 100644 gnqa/data/study1/results/results.json create mode 100644 gnqa/data/study1/results/results_aging.json create mode 100644 gnqa/data/study1/results/test.json create mode 100644 gnqa/data/study1/results/test2.json (limited to 'gnqa/data/study1/results') diff --git a/gnqa/data/study1/results/eval2_general1.json b/gnqa/data/study1/results/eval2_general1.json new file mode 100644 index 0000000..9c8dd91 --- /dev/null +++ b/gnqa/data/study1/results/eval2_general1.json @@ -0,0 +1,7 @@ +, +{ + "faithfulness": 0.7428571428571429, + "answer_relevancy": 0.9780678036268498, + "context_relevancy": 0.09343441716165339, + "context_utilization": 0.816596788224676 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/eval2_general2.json b/gnqa/data/study1/results/eval2_general2.json new file mode 100644 index 0000000..face395 --- /dev/null +++ b/gnqa/data/study1/results/eval2_general2.json @@ -0,0 +1,13 @@ +, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.10210226586398571, + "context_utilization": NaN +}, +{ + "faithfulness": 0.85, + "answer_relevancy": 0.6948351748903157, + "context_relevancy": 0.09669216181532704, + "context_utilization": 0.7730960707226785 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/eval_aging1.json b/gnqa/data/study1/results/eval_aging1.json new file mode 100644 index 0000000..7f020f8 --- /dev/null +++ b/gnqa/data/study1/results/eval_aging1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.90332619492291, + "context_relevancy": 0.16311053327554975, + "context_utilization": 0.9695800984320362 +}, +{ + "faithfulness": 0.9777777777777779, + "answer_relevancy": 0.9152650172290191, + "context_relevancy": 0.17545621228789543, + "context_utilization": 0.9695800984320362 +}, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9207411197703179, + "context_relevancy": 0.19377271060439374, + "context_utilization": 0.9695800984320362 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/eval_aging2.json b/gnqa/data/study1/results/eval_aging2.json new file mode 100644 index 0000000..5cf1f31 --- /dev/null +++ b/gnqa/data/study1/results/eval_aging2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9131945711490829, + "context_relevancy": 0.0843248379163872, + "context_utilization": 0.8269904041235476 +}, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9073113293523962, + "context_relevancy": 0.0843248379163872, + "context_utilization": 0.833091604265284 +}, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.909257413921701, + "context_relevancy": 0.0843248379163872, + "context_utilization": 0.833091604265284 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/eval_experts_aging1.json b/gnqa/data/study1/results/eval_experts_aging1.json new file mode 100644 index 0000000..19bfc90 --- /dev/null +++ b/gnqa/data/study1/results/eval_experts_aging1.json @@ -0,0 +1,18 @@ +{ + "faithfulness": 0.8742857142857143, + "answer_relevancy": 0.9678684040431473, + "context_relevancy": 0.2085018446737963, + "context_utilization": 0.9272852892960846 +}, +{ + "faithfulness": 0.8742857142857143, + "answer_relevancy": 0.9685606717668597, + "context_relevancy": 0.20135898753093917, + "context_utilization": 0.9272852892960846 +}, +{ + "faithfulness": 0.8742857142857143, + "answer_relevancy": 0.9690321094868484, + "context_relevancy": 0.20135898753093917, + "context_utilization": 0.9260832100237781 +} diff --git a/gnqa/data/study1/results/eval_experts_aging2.json b/gnqa/data/study1/results/eval_experts_aging2.json new file mode 100644 index 0000000..02c1939 --- /dev/null +++ b/gnqa/data/study1/results/eval_experts_aging2.json @@ -0,0 +1,18 @@ +{ + "faithfulness": 0.9714285714285715, + "answer_relevancy": 0.9655810278750667, + "context_relevancy": 0.22941000299490866, + "context_utilization": 0.9589677983113123 +}, +{ + "faithfulness": 0.9560439560439562, + "answer_relevancy": 0.9751092927895293, + "context_relevancy": 0.22941000299490866, + "context_utilization": 0.9589677983113123 +}, +{ + "faithfulness": 0.9560439560439562, + "answer_relevancy": 0.9751092927895293, + "context_relevancy": 0.23207666966157534, + "context_utilization": 0.9516178189920771 +} diff --git a/gnqa/data/study1/results/eval_experts_general1.json b/gnqa/data/study1/results/eval_experts_general1.json new file mode 100644 index 0000000..1bba1d5 --- /dev/null +++ b/gnqa/data/study1/results/eval_experts_general1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9053928340589652, + "context_relevancy": 0.2827950558213716, + "context_utilization": 0.7705234648910072 +}, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9157326745735066, + "context_relevancy": 0.2652511961722488, + "context_utilization": 0.7705234648910072 +}, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9096674856564787, + "context_relevancy": 0.25472488038277513, + "context_utilization": 0.7705234648910072 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/eval_experts_general2.json b/gnqa/data/study1/results/eval_experts_general2.json new file mode 100644 index 0000000..00aea70 --- /dev/null +++ b/gnqa/data/study1/results/eval_experts_general2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.8, + "answer_relevancy": 0.903335063636181, + "context_relevancy": 0.056258225526498694, + "context_utilization": 0.46176446463288745 +}, +{ + "faithfulness": 0.7666666666666667, + "answer_relevancy": 0.904390101613252, + "context_relevancy": 0.08775428851862468, + "context_utilization": 0.4464446356339682 +}, +{ + "faithfulness": 0.8, + "answer_relevancy": 0.9086449278497206, + "context_relevancy": 0.056258225526498694, + "context_utilization": 0.46176446463288745 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/eval_experts_suga1.json b/gnqa/data/study1/results/eval_experts_suga1.json new file mode 100644 index 0000000..cfabf1a --- /dev/null +++ b/gnqa/data/study1/results/eval_experts_suga1.json @@ -0,0 +1,18 @@ +{ + "faithfulness": 0.9612, + "answer_relevancy": 0.9295, + "context_relevancy": 0.1995, + "context_utilization": 0.842090248282362 +}, +{ + "faithfulness": 0.9612403100775193, + "answer_relevancy": 0.9266841312155393, + "context_relevancy": 0.21207858802198423, + "context_utilization": 0.842090248282362 +}, +{ + "faithfulness": 0.9612403100775193, + "answer_relevancy": 0.9284770424352974, + "context_relevancy": 0.2014315773749736, + "context_utilization": 0.842090248282362 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/eval_general1.json b/gnqa/data/study1/results/eval_general1.json new file mode 100644 index 0000000..80dbfc5 --- /dev/null +++ b/gnqa/data/study1/results/eval_general1.json @@ -0,0 +1,18 @@ +{ + "faithfulness": 0.6, + "answer_relevancy": 0.9801126654000318, + "context_relevancy": 0.09178152459966993, + "context_utilization": 0.8517819734097796 +}, +{ + "faithfulness": 0.6, + "answer_relevancy": 0.9825744284107565, + "context_relevancy": 0.09178152459966993, + "context_utilization": 0.816596788224676 +}, +{ + "faithfulness": 0.6, + "answer_relevancy": 0.9804185355149768, + "context_relevancy": 0.09065663938387562, + "context_utilization": 0.8517819734097796 +} diff --git a/gnqa/data/study1/results/eval_general2.json b/gnqa/data/study1/results/eval_general2.json new file mode 100644 index 0000000..51665e3 --- /dev/null +++ b/gnqa/data/study1/results/eval_general2.json @@ -0,0 +1,18 @@ +{ + "faithfulness": 0.85, + "answer_relevancy": 0.6941347949549538, + "context_relevancy": 0.09669216181532704, + "context_utilization": 0.7730960707226785 +}, +{ + "faithfulness": 0.85, + "answer_relevancy": 0.6934750290194251, + "context_relevancy": 0.13879742497322178, + "context_utilization": 0.7730960707226785 +}, +{ + "faithfulness": 0.85, + "answer_relevancy": 0.6943081762253429, + "context_relevancy": 0.09669216181532704, + "context_utilization": 0.7730960707226785 +} diff --git a/gnqa/data/study1/results/eval_suga1.json b/gnqa/data/study1/results/eval_suga1.json new file mode 100644 index 0000000..3e162d0 --- /dev/null +++ b/gnqa/data/study1/results/eval_suga1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9363046208472652, + "context_relevancy": 0.10308941188546791, + "context_utilization": 0.938356611481667 +}, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9387937731939724, + "context_relevancy": 0.10308941188546791, + "context_utilization": 0.9662574794748956 +}, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9372333468729981, + "context_relevancy": 0.10308941188546791, + "context_utilization": 0.9421623086941493 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/eval_suga2.json b/gnqa/data/study1/results/eval_suga2.json new file mode 100644 index 0000000..4ea2aa2 --- /dev/null +++ b/gnqa/data/study1/results/eval_suga2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9318400456917242, + "context_relevancy": 0.12194071444495894, + "context_utilization": 0.9657545215065534 +}, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9269052398452946, + "context_relevancy": 0.12194071444495894, + "context_utilization": 0.9657545215065534 +}, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9326698973133014, + "context_relevancy": 0.11492317058530979, + "context_utilization": 0.9717723548657957 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/eval_sugaA.json b/gnqa/data/study1/results/eval_sugaA.json new file mode 100644 index 0000000..fda4de7 --- /dev/null +++ b/gnqa/data/study1/results/eval_sugaA.json @@ -0,0 +1,7 @@ +, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.9332465603795168, + "context_relevancy": 0.17527404777829225, + "context_utilization": 0.9832121070042665 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gemma_eval_general1.json b/gnqa/data/study1/results/gemma_eval_general1.json new file mode 100644 index 0000000..6b13c83 --- /dev/null +++ b/gnqa/data/study1/results/gemma_eval_general1.json @@ -0,0 +1,7 @@ +, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.017839778759088275, + "context_utilization": NaN +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gemma_eval_general2.json b/gnqa/data/study1/results/gemma_eval_general2.json new file mode 100644 index 0000000..f2d4c5f --- /dev/null +++ b/gnqa/data/study1/results/gemma_eval_general2.json @@ -0,0 +1,7 @@ +, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.10522726586398572, + "context_utilization": NaN +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_1.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_1.json new file mode 100644 index 0000000..017d467 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9, + "context_utilization": 0.9070781944697044, + "context_relevancy": 0.2509564217695168, + "answer_relevancy": 0.9766358986013376 +}, +{ + "faithfulness": 0.9, + "context_utilization": 0.9070781944697044, + "context_relevancy": 0.39381356462665973, + "answer_relevancy": 0.9825656372129992 +}, +{ + "faithfulness": 0.9, + "context_utilization": 0.9104451978368653, + "context_relevancy": 0.39381356462665973, + "answer_relevancy": 0.973147869814394 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_2.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_2.json new file mode 100644 index 0000000..16e0754 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.96, + "context_utilization": 0.999999999991935, + "context_relevancy": 0.135272921108742, + "answer_relevancy": 0.9479744529828181 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.999999999991935, + "context_relevancy": 0.135272921108742, + "answer_relevancy": 0.951711024285933 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.999999999991935, + "context_relevancy": 0.14987988628287136, + "answer_relevancy": 0.9541549710773409 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_3.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_3.json new file mode 100644 index 0000000..566613d --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.97675568021047, + "context_relevancy": 0.2259505726726024, + "answer_relevancy": 0.9448278057931704 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.97675568021047, + "context_relevancy": 0.21568920951760603, + "answer_relevancy": 0.9444115188658463 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.97675568021047, + "context_relevancy": 0.22922926119719259, + "answer_relevancy": 0.9444470134072755 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_4.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_4.json new file mode 100644 index 0000000..61632cf --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_aging_4.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9375, + "context_utilization": 0.9456511261659628, + "context_relevancy": 0.19499540357020145, + "answer_relevancy": 0.9422926379891006 +}, +{ + "faithfulness": 0.9375, + "context_utilization": 0.9213036834852352, + "context_relevancy": 0.18966624996518577, + "answer_relevancy": 0.9493955674020345 +}, +{ + "faithfulness": 0.9375, + "context_utilization": 0.9213036834852352, + "context_relevancy": 0.19896857554658115, + "answer_relevancy": 0.9454532501945042 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_1.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_1.json new file mode 100644 index 0000000..63646cf --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.8533333333333333, + "context_utilization": 0.9438491717704647, + "context_relevancy": 0.20436440992383947, + "answer_relevancy": 0.957861571692806 +}, +{ + "faithfulness": 0.8355555555555556, + "context_utilization": 0.9438491717704647, + "context_relevancy": 0.2012874868469164, + "answer_relevancy": 0.9533191002746577 +}, +{ + "faithfulness": 0.8533333333333333, + "context_utilization": 0.9438491717704647, + "context_relevancy": 0.18389618249909034, + "answer_relevancy": 0.9498105973186146 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_2.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_2.json new file mode 100644 index 0000000..02fe10f --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9583333333333334, + "context_utilization": 0.7194444444356269, + "context_relevancy": 0.45524315840105317, + "answer_relevancy": 0.9496830965502638 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7220833333238528, + "context_relevancy": 0.3970421001999949, + "answer_relevancy": 0.947827635665291 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7194444444356269, + "context_relevancy": 0.3941849573428521, + "answer_relevancy": 0.9388702679644993 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_3.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_3.json new file mode 100644 index 0000000..6566e51 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.9237332568786083, + "context_relevancy": 0.2418398640689662, + "answer_relevancy": 0.9914901338443677 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9237332568786083, + "context_relevancy": 0.2352516287748486, + "answer_relevancy": 0.9926324858517163 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9295047961859101, + "context_relevancy": 0.2352516287748486, + "answer_relevancy": 0.9942151664950669 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_4.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_4.json new file mode 100644 index 0000000..29e72c0 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_diabetes_4.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.96, + "context_utilization": 0.8382274392203959, + "context_relevancy": 0.21850226437090842, + "answer_relevancy": 0.9268774561175513 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.8289482840320825, + "context_relevancy": 0.21792356066720475, + "answer_relevancy": 0.9264507966486306 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.8382274392203959, + "context_relevancy": 0.22104856066720474, + "answer_relevancy": 0.9306530537050953 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_1.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_1.json new file mode 100644 index 0000000..25a71b0 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999900003, + "context_relevancy": 0.05, + "answer_relevancy": 0.1823656883581401 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999900003, + "context_relevancy": 0.05, + "answer_relevancy": 0.1823656883581401 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999900003, + "context_relevancy": 0.05, + "answer_relevancy": 0.1823656883581401 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_3.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_3.json new file mode 100644 index 0000000..580e854 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.5999999999959664, + "context_relevancy": 0.22450090744101633, + "answer_relevancy": 0.562411241022707 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.5999999999959664, + "context_relevancy": 0.1687443284936479, + "answer_relevancy": 0.5643801560995779 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.5999999999959664, + "context_relevancy": 0.1687443284936479, + "answer_relevancy": 0.5617108358354678 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_4.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_4.json new file mode 100644 index 0000000..bcfc652 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_cs_gn_4.json @@ -0,0 +1,19 @@ +[ +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999882354, + "context_relevancy": 0.065625, + "answer_relevancy": 0.1834019127645967 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.1999999999988889, + "context_relevancy": 0.065625, + "answer_relevancy": 0.18443207660654864 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999882354, + "context_relevancy": 0.065625, + "answer_relevancy": 0.18442316533105405 +}] diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_1.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_1.json new file mode 100644 index 0000000..f719092 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.96, + "context_utilization": 0.9479350312277262, + "context_relevancy": 0.21303541253345637, + "answer_relevancy": 0.9224404704070004 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.9479350312277262, + "context_relevancy": 0.21303541253345637, + "answer_relevancy": 0.9204895776596349 +}, +{ + "faithfulness": 0.975, + "context_utilization": 0.9479350312277262, + "context_relevancy": 0.21303541253345637, + "answer_relevancy": 0.9233177482569399 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_2.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_2.json new file mode 100644 index 0000000..6539d02 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999917659, + "context_relevancy": 0.12455653962641092, + "answer_relevancy": 0.9215002061256425 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999917659, + "context_relevancy": 0.11027082534069661, + "answer_relevancy": 0.9238905660966263 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999917659, + "context_relevancy": 0.10345264352251479, + "answer_relevancy": 0.9236938936685843 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_3.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_3.json new file mode 100644 index 0000000..13c967f --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.9017950700460371, + "context_relevancy": 0.15025391166567637, + "answer_relevancy": 0.9080233205044008 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9017950700460371, + "context_relevancy": 0.1521235888294712, + "answer_relevancy": 0.9183172871520828 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9017950700460371, + "context_relevancy": 0.14271182412358882, + "answer_relevancy": 0.914051539296523 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_4.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_4.json new file mode 100644 index 0000000..b40e032 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_aging_4.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.873908075621365, + "context_relevancy": 0.13236286714496703, + "answer_relevancy": 0.9379656935564172 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.873908075621365, + "context_relevancy": 0.13236286714496703, + "answer_relevancy": 0.9291571366744364 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.873908075621365, + "context_relevancy": 0.13236286714496703, + "answer_relevancy": 0.9374908833538264 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_1.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_1.json new file mode 100644 index 0000000..d06530b --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_1.json @@ -0,0 +1,20 @@ +[ +{ + "faithfulness": 1.0, + "context_utilization": 0.9898660740877201, + "context_relevancy": 0.31265901349702185, + "answer_relevancy": 0.9236030246314068 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9898660740877201, + "context_relevancy": 0.14113303947104788, + "answer_relevancy": 0.9150252742414604 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9728819471034, + "context_relevancy": 0.13863303947104788, + "answer_relevancy": 0.9148789006153158 +} +] diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_2.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_2.json new file mode 100644 index 0000000..e9fee86 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_2.json @@ -0,0 +1,20 @@ +[ +{ + "faithfulness": 1.0, + "context_utilization": 0.7124087573371619, + "context_relevancy": 0.22621316914080075, + "answer_relevancy": 0.9046933431898141 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7004998969667501, + "context_relevancy": 0.23871316914080074, + "answer_relevancy": 0.9058328551471282 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7124087573371619, + "context_relevancy": 0.24675410481331536, + "answer_relevancy": 0.9079384840142384 +} +] diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_3.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_3.json new file mode 100644 index 0000000..e39107d --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_3.json @@ -0,0 +1,20 @@ +[ +{ + "faithfulness": 0.96, + "context_utilization": 0.7479011200345999, + "context_relevancy": 0.2814642730385713, + "answer_relevancy": 0.8930647394153285 +}, +{ + "faithfulness": 0.9099999999999999, + "context_utilization": 0.7479011200345999, + "context_relevancy": 0.2814642730385713, + "answer_relevancy": 0.896847471293901 +}, +{ + "faithfulness": 0.9099999999999999, + "context_utilization": 0.7479011200345999, + "context_relevancy": 0.2814642730385713, + "answer_relevancy": 0.8912330225043821 +} +] diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_4.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_4.json new file mode 100644 index 0000000..2be82a9 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_diabetes_4.json @@ -0,0 +1,20 @@ +[ +{ + "faithfulness": 0.9333333333333332, + "context_utilization": 0.7297725885164278, + "context_relevancy": 0.17196237023200656, + "answer_relevancy": 0.8650648136737542 +}, +{ + "faithfulness": 0.9333333333333332, + "context_utilization": 0.7297725885164278, + "context_relevancy": 0.19056702139479725, + "answer_relevancy": 0.877389474552466 +}, +{ + "faithfulness": 0.9333333333333332, + "context_utilization": 0.7297725885164278, + "context_relevancy": 0.12413628327548483, + "answer_relevancy": 0.8783898419790906 +} +] diff --git a/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_gn_3.json b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_gn_3.json new file mode 100644 index 0000000..8f33b47 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/gpt4o_eval_de_gn_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.3914232592779822, + "context_relevancy": 0.05517979452054794, + "answer_relevancy": 0.39015395726757396 +}, +{ + "faithfulness": 0.6666666666666666, + "context_utilization": 0.3914232592779822, + "context_relevancy": 0.05517979452054794, + "answer_relevancy": 0.3864361192318465 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.3914232592779822, + "context_relevancy": 0.05517979452054794, + "answer_relevancy": 0.3901540653386376 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/gpt4o/scores_cs_diabetes.json b/gnqa/data/study1/results/gpt4o/scores_cs_diabetes.json new file mode 100644 index 0000000..ef8c661 --- /dev/null +++ b/gnqa/data/study1/results/gpt4o/scores_cs_diabetes.json @@ -0,0 +1,37 @@ +, +{ + "faithfulness": 0.8836363636363636, + "context_utilization": 0.9533674463200074, + "context_relevancy": 0.1906017620560349, + "answer_relevancy": 0.9629314894517702 +}, +{ + "faithfulness": 0.8436363636363637, + "context_utilization": 0.9533674463200074, + "context_relevancy": 0.20364480596864404, + "answer_relevancy": 0.9495337378736439 +}, +{ + "faithfulness": 0.9292861989650555, + "context_utilization": 0.9651063978998563, + "context_relevancy": 0.7109415961877185, + "answer_relevancy": 0.6638464088279047 +}, +{ + "faithfulness": 0.4690747444442785, + "context_utilization": 0.7745118439410044, + "context_relevancy": 0.7140014395170777, + "answer_relevancy": 0.9322560108422944 +}, +{ + "faithfulness": 0.7745118439410044, + "context_utilization": 0.3333333333333333, + "context_relevancy": 0.3538011695906433, + "answer_relevancy": 0.5456168066603103 +}, +{ + "faithfulness": 0.5657894736779605, + "context_utilization": 1.0, + "context_relevancy": 0.22142857142857142, + "answer_relevancy": 0.7181594110215056 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_aging_1.json b/gnqa/data/study1/results/human/scores_cs_aging_1.json new file mode 100644 index 0000000..f37296e --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_aging_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9428571428571428, + "context_utilization": 0.9352808378906239, + "context_relevancy": 0.07125660926343383, + "answer_relevancy": 0.9523107847972947 +}, +{ + "faithfulness": 0.9428571428571428, + "context_utilization": 0.9355754170487147, + "context_relevancy": 0.07125660926343383, + "answer_relevancy": 0.9549674105661919 +}, +{ + "faithfulness": 0.9428571428571428, + "context_utilization": 0.9211814776549062, + "context_relevancy": 0.07125660926343383, + "answer_relevancy": 0.9499741000488516 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_aging_2.json b/gnqa/data/study1/results/human/scores_cs_aging_2.json new file mode 100644 index 0000000..f7dae45 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_aging_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.7742138364779875, + "context_utilization": 0.9894163077459343, + "context_relevancy": 0.04506568948673187, + "answer_relevancy": 0.9408685212116719 +}, +{ + "faithfulness": 0.7742138364779875, + "context_utilization": 0.9894163077459343, + "context_relevancy": 0.04506568948673187, + "answer_relevancy": 0.9443348131121218 +}, +{ + "faithfulness": 0.7742138364779875, + "context_utilization": 0.9894163077459343, + "context_relevancy": 0.04506568948673187, + "answer_relevancy": 0.9373602976132769 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_aging_3.json b/gnqa/data/study1/results/human/scores_cs_aging_3.json new file mode 100644 index 0000000..b844e70 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_aging_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.5714285714285715, + "context_utilization": 0.8007295763340471, + "context_relevancy": 0.17757604714126454, + "answer_relevancy": 0.9624406549445811 +}, +{ + "faithfulness": 0.5714285714285715, + "context_utilization": 0.8256406991618427, + "context_relevancy": 0.17757604714126454, + "answer_relevancy": 0.9624295953235836 +}, +{ + "faithfulness": 0.5714285714285715, + "context_utilization": 0.8256406991618427, + "context_relevancy": 0.17757604714126454, + "answer_relevancy": 0.9622154472101722 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_diabetes_1.json b/gnqa/data/study1/results/human/scores_cs_diabetes_1.json new file mode 100644 index 0000000..8316988 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_diabetes_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.875, + "context_utilization": 0.6983276538190184, + "context_relevancy": 0.12429532403609515, + "answer_relevancy": 0.9112620728936985 +}, +{ + "faithfulness": 0.875, + "context_utilization": 0.6983276538190184, + "context_relevancy": 0.09929532403609516, + "answer_relevancy": 0.9153897050102227 +}, +{ + "faithfulness": 0.875, + "context_utilization": 0.6983276538190184, + "context_relevancy": 0.10864315012305167, + "answer_relevancy": 0.917767867097622 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_diabetes_2.json b/gnqa/data/study1/results/human/scores_cs_diabetes_2.json new file mode 100644 index 0000000..7020070 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_diabetes_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.96, + "context_utilization": 0.9677256242806254, + "context_relevancy": 0.21125490196078428, + "answer_relevancy": 0.96903893567995 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.9769465411060386, + "context_relevancy": 0.2143799019607843, + "answer_relevancy": 0.9657737286038965 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.9769465411060386, + "context_relevancy": 0.2143799019607843, + "answer_relevancy": 0.9662487631948171 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_diabetes_3.json b/gnqa/data/study1/results/human/scores_cs_diabetes_3.json new file mode 100644 index 0000000..1b57ac7 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_diabetes_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.8400000000000001, + "context_utilization": 0.9538081741417747, + "context_relevancy": 0.11497132693854006, + "answer_relevancy": 0.9169018406443659 +}, +{ + "faithfulness": 0.8400000000000001, + "context_utilization": 0.9538081741417747, + "context_relevancy": 0.2016379936052067, + "answer_relevancy": 0.9187380038134432 +}, +{ + "faithfulness": 0.8400000000000001, + "context_utilization": 0.9434457191364413, + "context_relevancy": 0.11497132693854006, + "answer_relevancy": 0.9169054522175759 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_diabetes_4.json b/gnqa/data/study1/results/human/scores_cs_diabetes_4.json new file mode 100644 index 0000000..e54895e --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_diabetes_4.json @@ -0,0 +1,39 @@ +[ +{ + "faithfulness": 0.75, + "context_utilization": 0.49586940836114385, + "context_relevancy": 0.4489795918367347, + "answer_relevancy": 0.9050522628722737 +}, +{ + "faithfulness": 0.75, + "context_utilization": 0.5332560296769832, + "context_relevancy": 0.4489795918367347, + "answer_relevancy": 0.9274337314167257 +}, +{ + "faithfulness": 0.75, + "context_utilization": 0.49586940836114385, + "context_relevancy": 0.4489795918367347, + "answer_relevancy": 0.9274337314167257 +} +] +, +{ + "faithfulness": 0.75, + "context_utilization": 0.49586940836114385, + "context_relevancy": 0.2857142857142857, + "answer_relevancy": 0.9050522628722737 +}, +{ + "faithfulness": 0.75, + "context_utilization": 0.49586940836114385, + "context_relevancy": 0.4489795918367347, + "answer_relevancy": 0.9050692102679129 +}, +{ + "faithfulness": 0.75, + "context_utilization": 0.49586940836114385, + "context_relevancy": 0.4489795918367347, + "answer_relevancy": 0.9050522628722737 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_gn_1.json b/gnqa/data/study1/results/human/scores_cs_gn_1.json new file mode 100644 index 0000000..4481bdb --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_gn_1.json @@ -0,0 +1,14 @@ +[ +{ + "faithfulness": 0.9099999999999999, + "context_utilization": 0.7636817432217684, + "context_relevancy": 0.1880278568582262, + "answer_relevancy": 0.9423280729066063 +}, +{ + "faithfulness": 0.9099999999999999, + "context_utilization": 0.7357044805156637, + "context_relevancy": 0.15469452352489288, + "answer_relevancy": 0.9486310766041234 +} +] diff --git a/gnqa/data/study1/results/human/scores_cs_gn_2.json b/gnqa/data/study1/results/human/scores_cs_gn_2.json new file mode 100644 index 0000000..f0733da --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_gn_2.json @@ -0,0 +1,20 @@ +[ +{ + "faithfulness": 1.0, + "context_utilization": 0.6326643990778912, + "context_relevancy": 0.1347400263302517, + "answer_relevancy": 0.8746783013952267 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.6683786847884866, + "context_relevancy": 0.1508690585883162, + "answer_relevancy": 0.8703116371547157 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.6326643990778912, + "context_relevancy": 0.1332248748151002, + "answer_relevancy": 0.8689393391315343 +} +] diff --git a/gnqa/data/study1/results/human/scores_cs_gn_3.json b/gnqa/data/study1/results/human/scores_cs_gn_3.json new file mode 100644 index 0000000..7258a04 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_gn_3.json @@ -0,0 +1,25 @@ +, +{ + "faithfulness": 0.5677966101694916, + "context_utilization": 0.4561270844811867, + "context_relevancy": 0.5560185185148071, + "answer_relevancy": 0.5052295687739448 +}, +{ + "faithfulness": 0.75, + "context_utilization": 0.5643129043087701, + "context_relevancy": 0.05599820060366845, + "answer_relevancy": 0.7414497144046052 +}, +{ + "faithfulness": 0.75, + "context_utilization": 0.5729415276879585, + "context_relevancy": 0.05599820060366845, + "answer_relevancy": 0.5544292034718707 +}, +{ + "faithfulness": 0.75, + "context_utilization": 0.5643129043087701, + "context_relevancy": 0.05599820060366845, + "answer_relevancy": 0.5571557447633533 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_gn_4.json b/gnqa/data/study1/results/human/scores_cs_gn_4.json new file mode 100644 index 0000000..15b1eb4 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_gn_4.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9428571428571428, + "context_utilization": 0.789441709521905, + "context_relevancy": 0.136784410468621, + "answer_relevancy": 0.8500389108331188 +}, +{ + "faithfulness": 0.9142857142857143, + "context_utilization": 0.7921665772467545, + "context_relevancy": 0.15115688010424852, + "answer_relevancy": 0.8317623611813637 +}, +{ + "faithfulness": 0.9142857142857143, + "context_utilization": 0.789441709521905, + "context_relevancy": 0.1713997950840056, + "answer_relevancy": 0.8295033051724321 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_gn_5.json b/gnqa/data/study1/results/human/scores_cs_gn_5.json new file mode 100644 index 0000000..03713c2 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_gn_5.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9333333333333332, + "context_utilization": 0.6801836614504664, + "context_relevancy": 0.06454107195486505, + "answer_relevancy": 0.7372449377189451 +}, +{ + "faithfulness": 0.888888888888889, + "context_utilization": 0.6582554717950728, + "context_relevancy": 0.06454107195486505, + "answer_relevancy": 0.7372493726798736 +}, +{ + "faithfulness": 0.8761904761904763, + "context_utilization": 0.6582554717950728, + "context_relevancy": 0.06454107195486505, + "answer_relevancy": 0.7372449377189451 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_gn_6.json b/gnqa/data/study1/results/human/scores_cs_gn_6.json new file mode 100644 index 0000000..0d67e80 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_gn_6.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.75, + "context_utilization": 0.45564199508207504, + "context_relevancy": 0.06005275024001898, + "answer_relevancy": 0.8915679391851077 +}, +{ + "faithfulness": 0.75, + "context_utilization": 0.45564199508207504, + "context_relevancy": 0.05215801339791372, + "answer_relevancy": 0.7064299254450507 +}, +{ + "faithfulness": 0.75, + "context_utilization": 0.45564199508207504, + "context_relevancy": 0.0707670359543047, + "answer_relevancy": 0.705077643467664 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_cs_gn_7.json b/gnqa/data/study1/results/human/scores_cs_gn_7.json new file mode 100644 index 0000000..a30782a --- /dev/null +++ b/gnqa/data/study1/results/human/scores_cs_gn_7.json @@ -0,0 +1,18 @@ +{ + "faithfulness": 1.0, + "context_utilization": 0.9178474303338136, + "context_relevancy": 0.09082338152105594, + "answer_relevancy": 0.9524284122181226 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9178474303338136, + "context_relevancy": 0.09082338152105594, + "answer_relevancy": 0.9492709094955006 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9178474303338136, + "context_relevancy": 0.09082338152105594, + "answer_relevancy": 0.9524270517859097 +} diff --git a/gnqa/data/study1/results/human/scores_de_aging_1.json b/gnqa/data/study1/results/human/scores_de_aging_1.json new file mode 100644 index 0000000..0700cc3 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_aging_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.7428571428571429, + "context_utilization": 0.811213861888054, + "context_relevancy": 0.2314977832798794, + "answer_relevancy": 0.9433409234117335 +}, +{ + "faithfulness": 0.7428571428571429, + "context_utilization": 0.7983208584270672, + "context_relevancy": 0.24114933391503665, + "answer_relevancy": 0.9213466964486724 +}, +{ + "faithfulness": 0.7142857142857142, + "context_utilization": 0.7928499698879043, + "context_relevancy": 0.25367860791972047, + "answer_relevancy": 0.9318615626710995 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_de_aging_2.json b/gnqa/data/study1/results/human/scores_de_aging_2.json new file mode 100644 index 0000000..b7f8cc0 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_aging_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999923077, + "context_relevancy": 1.0, + "answer_relevancy": 0.8836732547434365 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999923077, + "context_relevancy": 1.0, + "answer_relevancy": 0.8836732547434365 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999923077, + "context_relevancy": 1.0, + "answer_relevancy": 0.8836732547434365 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_de_diabetes_1.1.json b/gnqa/data/study1/results/human/scores_de_diabetes_1.1.json new file mode 100644 index 0000000..0e46a7f --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_diabetes_1.1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.7777403152338384, + "context_relevancy": 0.06084656084656084, + "answer_relevancy": 0.9645121106959694 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7777403152338384, + "context_relevancy": 0.06084656084656084, + "answer_relevancy": 0.9545089573441493 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7719252969185456, + "context_relevancy": 0.05026455026455026, + "answer_relevancy": 0.9327156331092903 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_de_diabetes_1.json b/gnqa/data/study1/results/human/scores_de_diabetes_1.json new file mode 100644 index 0000000..0b621e2 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_diabetes_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9166666666666667, + "context_utilization": 0.7671392748688641, + "context_relevancy": 0.33561602418745273, + "answer_relevancy": 0.90324232280188 +}, +{ + "faithfulness": 0.9166666666666667, + "context_utilization": 0.8555804271901495, + "context_relevancy": 0.2314914450628736, + "answer_relevancy": 0.7214993293693964 +}, +{ + "faithfulness": 0.9666666666666668, + "context_utilization": 0.8080409996869443, + "context_relevancy": 0.2837641723356009, + "answer_relevancy": 0.9014349074286775 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_de_diabetes_2.json b/gnqa/data/study1/results/human/scores_de_diabetes_2.json new file mode 100644 index 0000000..bd6159a --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_diabetes_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.96, + "context_utilization": 0.9407265478802447, + "context_relevancy": 0.36922494182022314, + "answer_relevancy": 0.9364702737085768 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.9344763371477345, + "context_relevancy": 0.386466321130568, + "answer_relevancy": 0.944903559928554 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.9344763371477345, + "context_relevancy": 0.36922494182022314, + "answer_relevancy": 0.9355512181399582 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_de_gn_1.1.json b/gnqa/data/study1/results/human/scores_de_gn_1.1.json new file mode 100644 index 0000000..d47c31f --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_gn_1.1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9609375, + "context_utilization": 0.6937871661149843, + "context_relevancy": 0.13637360626722328, + "answer_relevancy": 0.7491735530216923 +}, +{ + "faithfulness": 0.9609375, + "context_utilization": 0.6937871661149843, + "context_relevancy": 0.13637360626722328, + "answer_relevancy": 0.8902254519253692 +}, +{ + "faithfulness": 0.9296875, + "context_utilization": 0.6937871661149843, + "context_relevancy": 0.13637360626722328, + "answer_relevancy": 0.7491716987687886 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_de_gn_1.json b/gnqa/data/study1/results/human/scores_de_gn_1.json new file mode 100644 index 0000000..9b8aea1 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_gn_1.json @@ -0,0 +1,18 @@ +{ + "faithfulness": 1.0, + "context_utilization": 0.9596645021564207, + "context_relevancy": 0.1634286630390054, + "answer_relevancy": 0.8973761639776056 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9596645021564207, + "context_relevancy": 0.1634286630390054, + "answer_relevancy": 0.9038434542970721 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9561079845997444, + "context_relevancy": 0.1634286630390054, + "answer_relevancy": 0.8983469111948426 +} diff --git a/gnqa/data/study1/results/human/scores_de_gn_2.json b/gnqa/data/study1/results/human/scores_de_gn_2.json new file mode 100644 index 0000000..30be099 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_gn_2.json @@ -0,0 +1,18 @@ +{ + "faithfulness": 0.8, + "context_utilization": 0.7266600180799679, + "context_relevancy": 0.12599664343008876, + "answer_relevancy": 0.7320068044307713 +}, +{ + "faithfulness": 0.8, + "context_utilization": 0.7266600180799679, + "context_relevancy": 0.13234584977929512, + "answer_relevancy": 0.7198147208663943 +}, +{ + "faithfulness": 0.8, + "context_utilization": 0.7266600180799679, + "context_relevancy": 0.12849969593314126, + "answer_relevancy": 0.7325464661134955 +} diff --git a/gnqa/data/study1/results/human/scores_de_gn_3.json b/gnqa/data/study1/results/human/scores_de_gn_3.json new file mode 100644 index 0000000..33a94ff --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_gn_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.8666666666666666, + "context_utilization": 0.6480859663109396, + "context_relevancy": 0.1510877797535341, + "answer_relevancy": 0.915240518467451 +}, +{ + "faithfulness": 0.8666666666666666, + "context_utilization": 0.6480859663109396, + "context_relevancy": 0.11387847742795269, + "answer_relevancy": 0.9124757388808369 +}, +{ + "faithfulness": 0.9333333333333332, + "context_utilization": 0.6480859663109396, + "context_relevancy": 0.1510877797535341, + "answer_relevancy": 0.9141762748312928 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_de_gn_4.json b/gnqa/data/study1/results/human/scores_de_gn_4.json new file mode 100644 index 0000000..345f566 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_gn_4.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.65, + "context_utilization": 0.354120538187183, + "context_relevancy": 0.1120026888642334, + "answer_relevancy": 0.7376780691990237 +}, +{ + "faithfulness": 0.5333333333333333, + "context_utilization": 0.34712053818788413, + "context_relevancy": 0.1120026888642334, + "answer_relevancy": 0.7455570356847625 +}, +{ + "faithfulness": 0.65, + "context_utilization": 0.34712053818788413, + "context_relevancy": 0.0993042761658207, + "answer_relevancy": 0.7376780609996703 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_de_gn_5.json b/gnqa/data/study1/results/human/scores_de_gn_5.json new file mode 100644 index 0000000..5148d68 --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_gn_5.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.8007395937295169, + "context_relevancy": 0.049944862903025335, + "answer_relevancy": 0.8599243307705603 +}, +{ + "faithfulness": 0.8, + "context_utilization": 0.806603791260579, + "context_relevancy": 0.049944862903025335, + "answer_relevancy": 0.6986715526356269 +}, +{ + "faithfulness": 0.9, + "context_utilization": 0.806603791260579, + "context_relevancy": 0.049944862903025335, + "answer_relevancy": 0.8579006890252776 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/human/scores_de_gn_6.json b/gnqa/data/study1/results/human/scores_de_gn_6.json new file mode 100644 index 0000000..25d04cf --- /dev/null +++ b/gnqa/data/study1/results/human/scores_de_gn_6.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999919545, + "context_relevancy": 0.20662768031189083, + "answer_relevancy": 0.9302858689849556 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999919545, + "context_relevancy": 0.2584795321637427, + "answer_relevancy": 0.9258655139523131 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999919545, + "context_relevancy": 0.1992202729044834, + "answer_relevancy": 0.9219977486705678 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/llamaeval_general1.json b/gnqa/data/study1/results/llamaeval_general1.json new file mode 100644 index 0000000..d9d134f --- /dev/null +++ b/gnqa/data/study1/results/llamaeval_general1.json @@ -0,0 +1,13 @@ +, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.924645390070922, + "context_utilization": NaN +}, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.8, + "context_utilization": NaN +} \ No newline at end of file diff --git a/gnqa/data/study1/results/results.json b/gnqa/data/study1/results/results.json new file mode 100644 index 0000000..4b30b95 --- /dev/null +++ b/gnqa/data/study1/results/results.json @@ -0,0 +1,20 @@ +{'faithfulness': nan, 'answer_relevancy': nan, 'context_relevancy': 0.7412, 'context_utilization': nan} +, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.5342715544752126, + "context_utilization": NaN +}, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.523524948140371, + "context_utilization": NaN +}, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.6374515308316596, + "context_utilization": NaN +} \ No newline at end of file diff --git a/gnqa/data/study1/results/results_aging.json b/gnqa/data/study1/results/results_aging.json new file mode 100644 index 0000000..7fad8ff --- /dev/null +++ b/gnqa/data/study1/results/results_aging.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.726235827137375, + "context_utilization": NaN +}, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.7121415843797659, + "context_utilization": NaN +}, +{ + "faithfulness": NaN, + "answer_relevancy": NaN, + "context_relevancy": 0.7374184453992012, + "context_utilization": NaN +} \ No newline at end of file diff --git a/gnqa/data/study1/results/test.json b/gnqa/data/study1/results/test.json new file mode 100644 index 0000000..c8fa2d4 --- /dev/null +++ b/gnqa/data/study1/results/test.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.75, + "answer_relevancy": 0.0, + "context_relevancy": 0.12244897959183673, + "context_utilization": 0.999999999990909 +}, +{ + "faithfulness": 0.75, + "answer_relevancy": 0.0, + "context_relevancy": 0.12244897959183673, + "context_utilization": 0.999999999990909 +}, +{ + "faithfulness": 0.75, + "answer_relevancy": 0.0, + "context_relevancy": 0.14285714285714285, + "context_utilization": 0.999999999990909 +} \ No newline at end of file diff --git a/gnqa/data/study1/results/test2.json b/gnqa/data/study1/results/test2.json new file mode 100644 index 0000000..9ae1d2d --- /dev/null +++ b/gnqa/data/study1/results/test2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "answer_relevancy": 0.982746184788807, + "context_relevancy": 0.09375, + "context_utilization": 0.99999999999 +}, +{ + "faithfulness": 0.9565217391304348, + "answer_relevancy": 0.982746184788807, + "context_relevancy": 0.09375, + "context_utilization": 0.99999999999 +}, +{ + "faithfulness": 0.9629629629629629, + "answer_relevancy": 0.9827409808824336, + "context_relevancy": 0.09375, + "context_utilization": 0.99999999999 +} \ No newline at end of file -- cgit v1.2.3