gardarjuto
committed on
Commit
•
aac8fa1
1
Parent(s):
d8d3ca7
initial commit
Browse files
app.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import matplotlib as mpl
|
3 |
+
|
4 |
+
from quiz import BenchmarkQuiz, BENCHMARKS, QuestionData
|
5 |
+
|
6 |
+
mpl.rcParams["figure.dpi"] = 300
|
7 |
+
|
8 |
+
quiz = BenchmarkQuiz()
|
9 |
+
|
10 |
+
def update_quiz_screen(question_data: QuestionData):
    """Build the Gradio updates that render *question_data* on the quiz screen.

    Shows exactly one of the two answer widgets: the radio group for
    multiple-choice benchmarks, the textbox for free-text ones. The user's
    previously stored answer (or None) is restored into the visible widget.
    """
    state = quiz.state
    stored_answer = state.user_answers[state.current_question]
    benchmark_type = BENCHMARKS[state.benchmark_name]["type"]
    return {
        quiz_screen: gr.update(visible=True),
        question_number: gr.update(value=question_data.question_num),
        question_text: gr.update(value=question_data.question),
        answer_input: gr.update(
            value=stored_answer,
            choices=question_data.options,
            visible=benchmark_type == "multiple_choice",
            label=question_data.instruction,
        ),
        free_text_input: gr.update(
            value=stored_answer,
            visible=benchmark_type == "free_text",
            label=question_data.instruction,
        ),
        next_button: gr.update(value=question_data.next_button_text),
        previous_button: gr.update(visible=question_data.previous_button_visibility),
    }
|
32 |
+
|
33 |
+
|
34 |
+
def update_score_screen(plot):
    """Reveal the results screen and place *plot* (a matplotlib figure) in it."""
    return {score_screen: gr.update(visible=True), score_plot: gr.update(value=plot)}
|
39 |
+
|
40 |
+
|
41 |
+
def start_quiz_handler(benchmark_name):
    """Start a fresh quiz for *benchmark_name* and show its first question."""
    quiz.start_quiz(benchmark_name)
    updates = {
        start_screen: gr.update(visible=False),
        score_screen: gr.update(visible=False),
    }
    # Merge in the component updates for question 1.
    updates.update(update_quiz_screen(quiz.update_question()))
    return updates
|
49 |
+
|
50 |
+
|
51 |
+
def next_question_handler(answer_input, free_text_input):
    """Record the answer from the active widget, then advance or finish.

    Only one of the two widgets is visible, so we pick its value based on
    the benchmark's type. On the last question, the quiz completes and the
    score screen replaces the quiz screen.
    """
    is_multiple_choice = (
        BENCHMARKS[quiz.state.benchmark_name]["type"] == "multiple_choice"
    )
    answer = answer_input if is_multiple_choice else free_text_input
    result = quiz.next_question(answer)
    if not result["completed"]:
        return update_quiz_screen(result["question_data"])
    return {
        quiz_screen: gr.update(visible=False),
        **update_score_screen(result["plot"]),
    }
|
65 |
+
|
66 |
+
|
67 |
+
def previous_question_handler():
    """Step back to the previous question and re-render the quiz screen."""
    return update_quiz_screen(quiz.previous_question())
|
70 |
+
|
71 |
+
|
72 |
+
def reset_quiz_handler():
    """Return to the benchmark-selection screen, hiding quiz and results.

    The next button's visibility is restored explicitly in case a previous
    run changed it.
    """
    return {
        start_screen: gr.update(visible=True),
        quiz_screen: gr.update(visible=False),
        score_screen: gr.update(visible=False),
        next_button: gr.update(visible=True),
    }
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Screen 1: benchmark selection — one button per entry in BENCHMARKS.
    start_screen = gr.Column(visible=True)
    with start_screen:
        gr.Markdown("# Veldu mælipróf")
        benchmark_buttons = {
            name: gr.Button(info["name"]) for name, info in BENCHMARKS.items()
        }

    # Screen 2: the running quiz. Only one of answer_input (multiple choice)
    # and free_text_input (free text) is visible at a time; update_quiz_screen
    # toggles them based on the active benchmark's type.
    quiz_screen = gr.Column(visible=False)
    with quiz_screen:
        question_number = gr.Markdown()
        question_text = gr.Markdown()
        answer_input = gr.Radio(choices=[], visible=False)
        free_text_input = gr.Textbox(visible=False)
        with gr.Row():
            previous_button = gr.Button("Fyrri")
            next_button = gr.Button("Næsta")

    # Screen 3: results — bar chart comparing the user against model scores.
    score_screen = gr.Column(visible=False)
    with score_screen:
        gr.Markdown(f"## Niðurstöður")
        score_plot = gr.Plot()
        reset_btn = gr.Button("Byrja upp á nýtt")

    # Wire each benchmark button to start its quiz. gr.State(benchmark_name)
    # freezes the loop variable's current value for that button's handler.
    for benchmark_name, button in benchmark_buttons.items():
        button.click(
            fn=start_quiz_handler,
            inputs=[gr.State(benchmark_name)],
            outputs=[
                start_screen,
                quiz_screen,
                score_screen,
                question_number,
                question_text,
                answer_input,
                free_text_input,
                next_button,
                previous_button,
            ],
        )

    # Advance (or finish) on "Næsta"/"Klára"; both answer widgets are passed
    # and the handler picks the relevant one.
    next_button.click(
        fn=next_question_handler,
        inputs=[answer_input, free_text_input],
        outputs=[
            quiz_screen,
            score_screen,
            question_number,
            question_text,
            answer_input,
            free_text_input,
            next_button,
            previous_button,
            score_plot,
        ],
    )

    previous_button.click(
        fn=previous_question_handler,
        inputs=[],
        outputs=[
            quiz_screen,
            question_number,
            question_text,
            answer_input,
            free_text_input,
            next_button,
            previous_button,
        ],
    )

    # "Byrja upp á nýtt" returns to the selection screen.
    reset_btn.click(
        fn=reset_quiz_handler,
        inputs=[],
        outputs=[start_screen, quiz_screen, score_screen, next_button],
    )

demo.launch()
|
quiz.py
ADDED
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from hmac import new
|
2 |
+
from datasets import load_dataset
|
3 |
+
from dataclasses import dataclass
|
4 |
+
from typing import Any, Dict, List, Optional
|
5 |
+
import random
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
from score import calculate_gpt4o_score, BENCHMARK_SCORES
|
8 |
+
|
9 |
+
|
10 |
+
# Define benchmarks
|
11 |
+
# Registry of selectable benchmarks. Each entry maps a benchmark key to:
#   name        - display label shown on the selection button
#   path        - Hugging Face dataset id passed to load_dataset()
#   config_name - optional dataset config (only where the dataset needs one)
#   split       - optional split name (defaults to "train" in load_benchmark)
#   type        - "multiple_choice" or "free_text"; selects the answer widget
BENCHMARKS = {
    "icelandic-winogrande": {
        "name": "Winogrande",
        "path": "mideind/icelandic-winogrande",
        "type": "multiple_choice",
    },
    "grammatical-error-detection": {
        "name": "Málfræðivillur",
        "path": "mideind/icelandic-sentences-gec",
        "type": "multiple_choice",
    },
    "icelandic-inflection-all": {
        "name": "Fallbeygingarpróf",
        "path": "mideind/icelandic-inflection-all-flat",
        "type": "free_text",
    },
    "icelandic-belebele": {
        "name": "Belebele",
        "path": "facebook/belebele",
        # Belebele is multilingual; use the Icelandic config's test split.
        "config_name": "isl_Latn",
        "split": "test",
        "type": "multiple_choice",
    },
    "icelandic-arc-challenge": {
        "name": "ARC Challenge",
        "path": "mideind/icelandic-arc-challenge",
        "type": "multiple_choice",
    },
    "icelandic-wiki-qa": {
        "name": "Wikipediapróf",
        "path": "mideind/icelandic_wiki_qa",
        "type": "free_text",
    },
}
|
45 |
+
|
46 |
+
|
47 |
+
# Dataset specific preprocessing and standardization
|
48 |
+
def winogrande_preprocessing(sample):
    """Convert a raw Winogrande sample into the standardized quiz format.

    The blank marker "_" in the sentence is widened to a visible gap and the
    answer index ("1"/"2") is resolved to the option text itself.
    """
    template = "Lestu eftirfarandi málsgrein:<p style='margin-left: 20px;'><i>{sentence}</i></p><br>Hvor valkostanna passar betur í eyðuna?"
    gapped_sentence = sample["sentence"].replace("_", "________")
    chosen = sample["option1"] if sample["answer"] == "1" else sample["option2"]
    return {
        "question": template.format(sentence=gapped_sentence),
        "options": (sample["option1"], sample["option2"]),
        "answer": chosen,
        "instruction": "Valkostir",
    }
|
61 |
+
|
62 |
+
|
63 |
+
def icelandic_sentence_gec_preprocessing(sample):
    """Convert a grammatical-error-detection sample into the quiz format.

    "correct" is truthy when the sentence contains no error, so the gold
    answer is "Engin villa" for correct sentences and "Villa" otherwise.
    """
    sentence = sample["sentence"]
    return {
        "question": f"Inniheldur eftirfarandi málsgrein villu?<p style='margin-left: 25px;'><i>{sentence}</i></p>",
        "options": ("Villa", "Engin villa"),
        "answer": "Engin villa" if sample["correct"] else "Villa",
        "instruction": "Valkostir",
    }
|
72 |
+
|
73 |
+
|
74 |
+
def inflection_all_preprocessing(sample):
    """Convert an inflection sample into the quiz format (free text).

    Abbreviated grammatical labels are expanded to their full Icelandic
    names before being interpolated into the question.
    """
    case_names = {
        "nf": "nefnifalli",
        "þf": "þolfalli",
        "þgf": "þágufalli",
        "ef": "eignarfalli",
    }
    number_names = {"et": "eintölu", "ft": "fleirtölu"}
    phrase = sample["noun_phrase"]
    case_name = case_names[sample["case"]]
    number_name = number_names[sample["plurality"]]
    return {
        "question": f"Hvernig beygist <i>„{phrase}“</i> í {case_name} {number_name}?",
        "answer": sample["inflection"],
        "instruction": "Skrifaðu réttu beyginguna.",
    }
|
89 |
+
|
90 |
+
|
91 |
+
def belebele_preprocessing(sample):
    """Convert a Belebele reading-comprehension sample into the quiz format.

    The four distractor columns are collected into a list and the 1-based
    "correct_answer_num" is resolved to the option text.
    """
    passage = sample["flores_passage"]
    question = sample["question"]
    options = [sample[f"mc_answer{i}"] for i in range(1, 5)]
    answer_index = int(sample["correct_answer_num"]) - 1
    return {
        "question": f'Lestu eftirfarandi texta:<p style="margin-left: 25px;"><i>{passage}</i></p>\n\n{question}',
        "options": options,
        "answer": options[answer_index],
        "instruction": "Veldu réttasta svarið.",
    }
|
106 |
+
|
107 |
+
|
108 |
+
def arc_challenge_preprocessing(sample):
    """Convert an ARC Challenge sample into the quiz format.

    The answer key (e.g. "B") is located in the parallel "label" list and
    mapped to the corresponding choice text.
    """
    choices = sample["choices"]
    options = choices["text"]
    answer_position = choices["label"].index(sample["answerKey"])
    return {
        "question": sample["question"],
        "options": options,
        "answer": options[answer_position],
        "instruction": "Veldu réttasta svarið.",
    }
|
116 |
+
|
117 |
+
|
118 |
+
def wikipedia_preprocessing(sample):
    """Convert a Wikipedia QA sample into the quiz format (free text)."""
    return {
        "question": sample["query"],
        "answer": sample["answer"],
        "instruction": "Skrifaðu svarið þitt að neðan.",
    }
|
124 |
+
|
125 |
+
|
126 |
+
@dataclass
class QuizState:
    """Mutable state for one quiz run, owned by a BenchmarkQuiz instance."""

    # Key into BENCHMARKS identifying the active benchmark.
    benchmark_name: str
    # Preprocessed samples in presentation order.
    samples: List[Dict[str, Any]]
    # Zero-based index of the question currently shown.
    current_question: int
    # One slot per sample; None until the user answers that question.
    user_answers: List[Optional[str]]
    # Gold answers, parallel to `samples`.
    correct_answers: List[str]
    # Set to True once the final question has been submitted.
    quiz_completed: bool
|
134 |
+
|
135 |
+
|
136 |
+
@dataclass
class QuestionData:
    """Everything the UI needs to render a single question."""

    # Markdown heading, e.g. "### Spurning 2 af 5".
    question_num: str
    # Question body (may contain HTML markup).
    question: str
    # Choice texts for multiple-choice benchmarks; None for free text.
    options: Optional[List[str]]
    # The user's previously stored answer for this question, if any.
    answer: Optional[str]
    # "Næsta" normally, "Klára" on the last question.
    next_button_text: str
    # Hide the previous button on the first question.
    previous_button_visibility: bool
    # Label shown on the answer widget.
    instruction: str = ""
|
145 |
+
|
146 |
+
|
147 |
+
class BenchmarkQuiz:
    """Drives one quiz at a time: loading samples, navigation, and scoring.

    A single instance is shared by the app; calling start_quiz() discards
    any previous run and installs a fresh QuizState.
    """

    def __init__(self):
        # No quiz is running until start_quiz() is called.
        self.state: Optional[QuizState] = None

    def start_quiz(self, benchmark_name: str) -> QuizState:
        """Load samples for *benchmark_name* and initialize fresh state.

        Returns the new QuizState (also kept on self.state).
        """
        samples = self.load_benchmark(benchmark_name)
        self.state = QuizState(
            benchmark_name=benchmark_name,
            samples=samples,
            current_question=0,
            user_answers=[None] * len(samples),
            correct_answers=[sample["answer"] for sample in samples],
            quiz_completed=False,
        )
        return self.state

    def load_benchmark(
        self, benchmark_name: str, num_samples: int = 5
    ) -> List[Dict[str, Any]]:
        """Download the benchmark's dataset and return *num_samples* random,
        preprocessed samples.

        Raises KeyError for an unknown benchmark name. (The previous if/elif
        chain silently returned raw, unpreprocessed samples in that case.)
        """
        # Dispatch table mapping benchmark key -> standardizing function;
        # keeps the mapping in one place instead of a six-way elif chain.
        preprocessors = {
            "icelandic-winogrande": winogrande_preprocessing,
            "grammatical-error-detection": icelandic_sentence_gec_preprocessing,
            "icelandic-inflection-all": inflection_all_preprocessing,
            "icelandic-belebele": belebele_preprocessing,
            "icelandic-arc-challenge": arc_challenge_preprocessing,
            "icelandic-wiki-qa": wikipedia_preprocessing,
        }
        preprocess = preprocessors[benchmark_name]
        config = BENCHMARKS[benchmark_name]
        dataset = load_dataset(
            config["path"],
            name=config.get("config_name"),
            split=config.get("split", "train"),
        )
        samples = random.sample(list(dataset), num_samples)
        return [preprocess(sample) for sample in samples]

    def update_question(self) -> QuestionData:
        """Build the QuestionData for the current question.

        Called whenever the user navigates to a question.
        """
        current = self.state.current_question
        sample = self.state.samples[current]
        total = len(self.state.samples)
        return QuestionData(
            question_num=f"### Spurning {current + 1} af {total}",
            question=sample["question"],
            options=sample.get("options"),
            answer=self.state.user_answers[current],
            # The last question's button reads "Klára" (finish).
            next_button_text="Klára" if current == total - 1 else "Næsta",
            previous_button_visibility=current > 0,
            instruction=sample.get("instruction", ""),
        )

    def next_question(self, answer: str) -> Dict[str, Any]:
        """Record *answer* for the current question and advance.

        Returns {"completed": False, "question_data": ...} while questions
        remain, or {"completed": True, "plot": ...} after the last one.
        """
        self.state.user_answers[self.state.current_question] = answer
        if self.state.current_question < len(self.state.samples) - 1:
            self.state.current_question += 1
            return {"completed": False, "question_data": self.update_question()}
        self.state.quiz_completed = True
        user_score = self.calculate_score()
        return {"completed": True, "plot": self.plot_score(user_score)}

    def previous_question(self) -> QuestionData:
        """Step back one question (no-op at the first) and rebuild its data."""
        if self.state.current_question > 0:
            self.state.current_question -= 1
        return self.update_question()

    def calculate_score(self) -> float:
        """Return the user's score as a fraction in [0, 1].

        Wiki QA answers are free text and graded by an LLM judge; all other
        benchmarks use exact-match accuracy.
        """
        if self.state.benchmark_name == "icelandic-wiki-qa":
            queries = [sample["question"] for sample in self.state.samples]
            return calculate_gpt4o_score(
                queries, self.state.user_answers, self.state.correct_answers
            )

        num_correct = sum(
            user_answer == correct_answer
            for user_answer, correct_answer in zip(
                self.state.user_answers, self.state.correct_answers
            )
        )
        return num_correct / len(self.state.correct_answers)

    def plot_score(self, user_score: float):
        """Horizontal bar chart comparing the user against model scores.

        NOTE(review): requires BENCHMARK_SCORES to contain a key equal to the
        current benchmark name.
        """
        scores = {**BENCHMARK_SCORES[self.state.benchmark_name], "Þú": 100 * user_score}
        # Sort ascending so the highest bar ends up at the top of the chart.
        scores = dict(sorted(scores.items(), key=lambda item: item[1]))

        # The user's bar is highlighted in green; models are blue.
        colors = {name: "tab:blue" for name in scores.keys()}
        colors["Þú"] = "tab:green"

        fig, ax = plt.subplots(figsize=(10, 6), dpi=250)
        ax.spines[["left", "top", "right"]].set_visible(False)

        ax.barh(
            scores.keys(),
            scores.values(),
            height=0.6,
            color=[colors[name] for name in scores.keys()],
        )
        ax.set_axisbelow(True)
        ax.xaxis.grid(True, linestyle="--", alpha=0.6)
        ax.set_title(
            f"{BENCHMARKS[self.state.benchmark_name]['name']}: Svona stóðstu þig miðað við mállíkönin",
            pad=20,
        )
        ax.set_xlabel("Stig (%)")
        ax.set_xlim(0, 100)
        plt.tight_layout()
        return fig
|
score.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from openai import OpenAI
|
4 |
+
import re
|
5 |
+
|
6 |
+
# Model scores
|
7 |
+
# Reference model scores (percent) per benchmark, keyed by the SAME keys as
# BENCHMARKS in quiz.py — plot_score() looks entries up by benchmark name.
# Fix: the Belebele entry was keyed "belebele", which never matched the
# benchmark name "icelandic-belebele" and raised KeyError when plotting.
BENCHMARK_SCORES = {
    "icelandic-winogrande": {
        "Claude 3.5 Sonnet": 90.4,
        "GPT-4o": 85.4,
        "GPT-4-turbo": 85.8,
        "Hermes 3 Llama 3.1 405B fp8": 70.6,
        "Claude 2.1": 55.1,
        "GPT-3.5-turbo": 52.0,
    },
    "grammatical-error-detection": {
        "Claude 3.5 Sonnet": 70.0,
        "GPT-4o": 68.0,
        "GPT-4-turbo": 60.5,
        "Hermes 3 Llama 3.1 405B fp8": 53.5,
        "Claude 2.1": 52.5,
        "GPT-3.5-turbo": 52.0,
    },
    "icelandic-inflection-all": {
        "Claude 3.5 Sonnet": 89.2,
        "GPT-4o": 87.8,
        "GPT-4-turbo": 76.6,
        "Hermes 3 Llama 3.1 405B fp8": 61.8,
        "Claude 2.1": 55.2,
        "GPT-3.5-turbo": 39.1,
    },
    "icelandic-belebele": {
        "Claude 3.5 Sonnet": 92.0,
        "GPT-4o": 90.4,
        "GPT-4-turbo": 89.3,
        "Hermes 3 Llama 3.1 405B fp8": 86.1,
        "Claude 2.1": 42.1,
        "GPT-3.5-turbo": 59.2,
    },
    "icelandic-arc-challenge": {
        "Claude 3.5 Sonnet": 89.6,
        "GPT-4o": 90.4,
        "GPT-4-turbo": 88.7,
        "Hermes 3 Llama 3.1 405B fp8": 72.0,
        "Claude 2.1": 59.9,
        "GPT-3.5-turbo": 49.5,
    },
    "icelandic-wiki-qa": {
        "Claude 3.5 Sonnet": 44.7,
        "GPT-4o": 38.0,
        "GPT-4-turbo": 31.0,
        "Hermes 3 Llama 3.1 405B fp8": 33.8,
        "Claude 2.1": 21.1,
        "GPT-3.5-turbo": 15.0,
    },
}
|
57 |
+
|
58 |
+
|
59 |
+
# Module-level OpenAI client used by calculate_gpt4o_score as an LLM judge.
# Reads the API key from the OPENAI_API_KEY environment variable.
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)
|
63 |
+
|
64 |
+
|
65 |
+
def calculate_gpt4o_score(queries, user_answers, correct_answers):
    """Grade free-text answers for the Icelandic Wiki QA benchmark.

    GPT-4o acts as a judge: for each (question, reference answer, user
    answer) triple it returns a rating of "poor", "fair", or "excellent",
    which is mapped to 0 / 0.5 / 1 and averaged over all questions.

    Args:
        queries: the questions asked.
        user_answers: the user's free-text answers, parallel to queries.
        correct_answers: reference answers, parallel to queries.

    Returns:
        The mean rating value in [0, 1]; 0.0 for empty input.

    Fixes over the original:
    - a judge reply that does not contain "Rating: [[...]]" no longer raises
      AttributeError (search() returning None); it is scored as "poor";
    - empty input no longer raises ZeroDivisionError.
    """
    prompt = "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness. You will be given the question which was asked, a correct reference answer, and the assistant's answer. Begin your evaluation by briefly comparing the assistant's answer with the correct answer. Identify any mistakes. Be as objective as possible. Additional information beyond the reference answer's content should not be considered. If the assistant's answer is not in Icelandic but the reference answer is, you should rate the answer poorly. After providing your short explanation, you must rate the assistant's answer using the following scale: [[poor]]: Incorrect, off-topic or in a different language; [[fair]]: Partially aligns with the reference answer with some inaccuracies or irrelevant information; [[excellent]]: Accurate and relevant, matching the reference answer in content and language.\nProvide your rating strictly in this format: \"Rating: [[category]]\", for example: \"Rating: [[fair]]\".\n\n[Question]\n{query}\n\n[Start of Correct Answer]\n{answer}\n[End of Correct Answer]\n\n[Start of Assistant's Answer]\n{response}\n[End of Assistant's Answer]"
    score_filter = re.compile(r"Rating: \[\[(.*?)\]\]")
    # Numeric value of each rating; unknown/missing ratings count as 0.
    rating_values = {"excellent": 1.0, "fair": 0.5, "poor": 0.0}

    total = 0.0
    count = 0
    for query, u_answer, c_answer in zip(queries, user_answers, correct_answers):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt.format(
                        query=query, answer=c_answer, response=u_answer
                    ),
                }
            ],
            model="gpt-4o",
            max_completion_tokens=200,
        )
        chat = chat_completion.choices[0].message.content or ""
        match = score_filter.search(chat)
        # Judge ignored the required format -> treat as the lowest rating.
        rating = match.group(1).lower() if match else "poor"
        total += rating_values.get(rating, 0.0)
        count += 1

    if count == 0:
        return 0.0
    return total / count
|