diff options
Diffstat (limited to 'gnqa/paper1_eval/src/data/results/gpt4o')
21 files changed, 421 insertions, 0 deletions
diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_1.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_1.json new file mode 100644 index 0000000..017d467 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9, + "context_utilization": 0.9070781944697044, + "context_relevancy": 0.2509564217695168, + "answer_relevancy": 0.9766358986013376 +}, +{ + "faithfulness": 0.9, + "context_utilization": 0.9070781944697044, + "context_relevancy": 0.39381356462665973, + "answer_relevancy": 0.9825656372129992 +}, +{ + "faithfulness": 0.9, + "context_utilization": 0.9104451978368653, + "context_relevancy": 0.39381356462665973, + "answer_relevancy": 0.973147869814394 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_2.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_2.json new file mode 100644 index 0000000..16e0754 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.96, + "context_utilization": 0.999999999991935, + "context_relevancy": 0.135272921108742, + "answer_relevancy": 0.9479744529828181 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.999999999991935, + "context_relevancy": 0.135272921108742, + "answer_relevancy": 0.951711024285933 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.999999999991935, + "context_relevancy": 0.14987988628287136, + "answer_relevancy": 0.9541549710773409 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_3.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_3.json new file mode 100644 index 0000000..566613d --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.97675568021047, + "context_relevancy": 0.2259505726726024, + "answer_relevancy": 0.9448278057931704 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.97675568021047, + "context_relevancy": 0.21568920951760603, + "answer_relevancy": 0.9444115188658463 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.97675568021047, + "context_relevancy": 0.22922926119719259, + "answer_relevancy": 0.9444470134072755 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_4.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_4.json new file mode 100644 index 0000000..61632cf --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_aging_4.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9375, + "context_utilization": 0.9456511261659628, + "context_relevancy": 0.19499540357020145, + "answer_relevancy": 0.9422926379891006 +}, +{ + "faithfulness": 0.9375, + "context_utilization": 0.9213036834852352, + "context_relevancy": 0.18966624996518577, + "answer_relevancy": 0.9493955674020345 +}, +{ + "faithfulness": 0.9375, + "context_utilization": 0.9213036834852352, + "context_relevancy": 0.19896857554658115, + "answer_relevancy": 0.9454532501945042 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_1.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_1.json new file mode 100644 index 0000000..63646cf --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.8533333333333333, + "context_utilization": 0.9438491717704647, + "context_relevancy": 0.20436440992383947, + "answer_relevancy": 0.957861571692806 +}, +{ + "faithfulness": 0.8355555555555556, + "context_utilization": 0.9438491717704647, + "context_relevancy": 0.2012874868469164, + "answer_relevancy": 0.9533191002746577 +}, +{ + "faithfulness": 0.8533333333333333, + "context_utilization": 0.9438491717704647, + "context_relevancy": 0.18389618249909034, + "answer_relevancy": 0.9498105973186146 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_2.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_2.json new file mode 100644 index 0000000..02fe10f --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.9583333333333334, + "context_utilization": 0.7194444444356269, + "context_relevancy": 0.45524315840105317, + "answer_relevancy": 0.9496830965502638 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7220833333238528, + "context_relevancy": 0.3970421001999949, + "answer_relevancy": 0.947827635665291 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7194444444356269, + "context_relevancy": 0.3941849573428521, + "answer_relevancy": 0.9388702679644993 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_3.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_3.json new file mode 100644 index 0000000..6566e51 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.9237332568786083, + "context_relevancy": 0.2418398640689662, + "answer_relevancy": 0.9914901338443677 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9237332568786083, + "context_relevancy": 0.2352516287748486, + "answer_relevancy": 0.9926324858517163 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9295047961859101, + "context_relevancy": 0.2352516287748486, + "answer_relevancy": 0.9942151664950669 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_4.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_4.json new file mode 100644 index 0000000..29e72c0 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_diabetes_4.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.96, + "context_utilization": 0.8382274392203959, + "context_relevancy": 0.21850226437090842, + "answer_relevancy": 0.9268774561175513 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.8289482840320825, + "context_relevancy": 0.21792356066720475, + "answer_relevancy": 0.9264507966486306 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.8382274392203959, + "context_relevancy": 0.22104856066720474, + "answer_relevancy": 0.9306530537050953 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_gn_1.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_gn_1.json new file mode 100644 index 0000000..25a71b0 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_gn_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999900003, + "context_relevancy": 0.05, + "answer_relevancy": 0.1823656883581401 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999900003, + "context_relevancy": 0.05, + "answer_relevancy": 0.1823656883581401 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999900003, + "context_relevancy": 0.05, + "answer_relevancy": 0.1823656883581401 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_gn_3.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_gn_3.json new file mode 100644 index 0000000..580e854 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_gn_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.5999999999959664, + "context_relevancy": 0.22450090744101633, + "answer_relevancy": 0.562411241022707 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.5999999999959664, + "context_relevancy": 0.1687443284936479, + "answer_relevancy": 0.5643801560995779 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.5999999999959664, + "context_relevancy": 0.1687443284936479, + "answer_relevancy": 0.5617108358354678 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_gn_4.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_gn_4.json new file mode 100644 index 0000000..bcfc652 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_cs_gn_4.json @@ -0,0 +1,19 @@ +[ +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999882354, + "context_relevancy": 0.065625, + "answer_relevancy": 0.1834019127645967 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.1999999999988889, + "context_relevancy": 0.065625, + "answer_relevancy": 0.18443207660654864 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.19999999999882354, + "context_relevancy": 0.065625, + "answer_relevancy": 0.18442316533105405 +}] diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_1.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_1.json new file mode 100644 index 0000000..f719092 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_1.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 0.96, + "context_utilization": 0.9479350312277262, + "context_relevancy": 0.21303541253345637, + "answer_relevancy": 0.9224404704070004 +}, +{ + "faithfulness": 0.96, + "context_utilization": 0.9479350312277262, + "context_relevancy": 0.21303541253345637, + "answer_relevancy": 0.9204895776596349 +}, +{ + "faithfulness": 0.975, + "context_utilization": 0.9479350312277262, + "context_relevancy": 0.21303541253345637, + "answer_relevancy": 0.9233177482569399 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_2.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_2.json new file mode 100644 index 0000000..6539d02 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_2.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999917659, + "context_relevancy": 0.12455653962641092, + "answer_relevancy": 0.9215002061256425 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999917659, + "context_relevancy": 0.11027082534069661, + "answer_relevancy": 0.9238905660966263 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9999999999917659, + "context_relevancy": 0.10345264352251479, + "answer_relevancy": 0.9236938936685843 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_3.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_3.json new file mode 100644 index 0000000..13c967f --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.9017950700460371, + "context_relevancy": 0.15025391166567637, + "answer_relevancy": 0.9080233205044008 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9017950700460371, + "context_relevancy": 0.1521235888294712, + "answer_relevancy": 0.9183172871520828 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9017950700460371, + "context_relevancy": 0.14271182412358882, + "answer_relevancy": 0.914051539296523 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_4.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_4.json new file mode 100644 index 0000000..b40e032 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_aging_4.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.873908075621365, + "context_relevancy": 0.13236286714496703, + "answer_relevancy": 0.9379656935564172 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.873908075621365, + "context_relevancy": 0.13236286714496703, + "answer_relevancy": 0.9291571366744364 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.873908075621365, + "context_relevancy": 0.13236286714496703, + "answer_relevancy": 0.9374908833538264 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_1.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_1.json new file mode 100644 index 0000000..d06530b --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_1.json @@ -0,0 +1,20 @@ +[ +{ + "faithfulness": 1.0, + "context_utilization": 0.9898660740877201, + "context_relevancy": 0.31265901349702185, + "answer_relevancy": 0.9236030246314068 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9898660740877201, + "context_relevancy": 0.14113303947104788, + "answer_relevancy": 0.9150252742414604 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.9728819471034, + "context_relevancy": 0.13863303947104788, + "answer_relevancy": 0.9148789006153158 +} +] diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_2.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_2.json new file mode 100644 index 0000000..e9fee86 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_2.json @@ -0,0 +1,20 @@ +[ +{ + "faithfulness": 1.0, + "context_utilization": 0.7124087573371619, + "context_relevancy": 0.22621316914080075, + "answer_relevancy": 0.9046933431898141 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7004998969667501, + "context_relevancy": 0.23871316914080074, + "answer_relevancy": 0.9058328551471282 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.7124087573371619, + "context_relevancy": 0.24675410481331536, + "answer_relevancy": 0.9079384840142384 +} +] diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_3.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_3.json new file mode 100644 index 0000000..e39107d --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_3.json @@ -0,0 +1,20 @@ +[ +{ + "faithfulness": 0.96, + "context_utilization": 0.7479011200345999, + "context_relevancy": 0.2814642730385713, + "answer_relevancy": 0.8930647394153285 +}, +{ + "faithfulness": 0.9099999999999999, + "context_utilization": 0.7479011200345999, + "context_relevancy": 0.2814642730385713, + "answer_relevancy": 0.896847471293901 +}, +{ + "faithfulness": 0.9099999999999999, + "context_utilization": 0.7479011200345999, + "context_relevancy": 0.2814642730385713, + "answer_relevancy": 0.8912330225043821 +} +] diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_4.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_4.json new file mode 100644 index 0000000..2be82a9 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_diabetes_4.json @@ -0,0 +1,20 @@ +[ +{ + "faithfulness": 0.9333333333333332, + "context_utilization": 0.7297725885164278, + "context_relevancy": 0.17196237023200656, + "answer_relevancy": 0.8650648136737542 +}, +{ + "faithfulness": 0.9333333333333332, + "context_utilization": 0.7297725885164278, + "context_relevancy": 0.19056702139479725, + "answer_relevancy": 0.877389474552466 +}, +{ + "faithfulness": 0.9333333333333332, + "context_utilization": 0.7297725885164278, + "context_relevancy": 0.12413628327548483, + "answer_relevancy": 0.8783898419790906 +} +] diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_gn_3.json b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_gn_3.json new file mode 100644 index 0000000..8f33b47 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/gpt4o_eval_de_gn_3.json @@ -0,0 +1,19 @@ +, +{ + "faithfulness": 1.0, + "context_utilization": 0.3914232592779822, + "context_relevancy": 0.05517979452054794, + "answer_relevancy": 0.39015395726757396 +}, +{ + "faithfulness": 0.6666666666666666, + "context_utilization": 0.3914232592779822, + "context_relevancy": 0.05517979452054794, + "answer_relevancy": 0.3864361192318465 +}, +{ + "faithfulness": 1.0, + "context_utilization": 0.3914232592779822, + "context_relevancy": 0.05517979452054794, + "answer_relevancy": 0.3901540653386376 +}
\ No newline at end of file diff --git a/gnqa/paper1_eval/src/data/results/gpt4o/scores_cs_diabetes.json b/gnqa/paper1_eval/src/data/results/gpt4o/scores_cs_diabetes.json new file mode 100644 index 0000000..ef8c661 --- /dev/null +++ b/gnqa/paper1_eval/src/data/results/gpt4o/scores_cs_diabetes.json @@ -0,0 +1,37 @@ +, +{ + "faithfulness": 0.8836363636363636, + "context_utilization": 0.9533674463200074, + "context_relevancy": 0.1906017620560349, + "answer_relevancy": 0.9629314894517702 +}, +{ + "faithfulness": 0.8436363636363637, + "context_utilization": 0.9533674463200074, + "context_relevancy": 0.20364480596864404, + "answer_relevancy": 0.9495337378736439 +}, +{ + "faithfulness": 0.9292861989650555, + "context_utilization": 0.9651063978998563, + "context_relevancy": 0.7109415961877185, + "answer_relevancy": 0.6638464088279047 +}, +{ + "faithfulness": 0.4690747444442785, + "context_utilization": 0.7745118439410044, + "context_relevancy": 0.7140014395170777, + "answer_relevancy": 0.9322560108422944 +}, +{ + "faithfulness": 0.7745118439410044, + "context_utilization": 0.3333333333333333, + "context_relevancy": 0.3538011695906433, + "answer_relevancy": 0.5456168066603103 +}, +{ + "faithfulness": 0.5657894736779605, + "context_utilization": 1.0, + "context_relevancy": 0.22142857142857142, + "answer_relevancy": 0.7181594110215056 +}
\ No newline at end of file |