gardarjuto committed on
Commit
aac8fa1
1 Parent(s): d8d3ca7

initial commit

Browse files
Files changed (3) hide show
  1. app.py +159 -0
  2. quiz.py +278 -0
  3. score.py +92 -0
app.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import matplotlib as mpl

from quiz import BenchmarkQuiz, BENCHMARKS, QuestionData

# Render the score plot at high DPI so it stays sharp in the browser.
mpl.rcParams["figure.dpi"] = 300

# Single module-level quiz instance shared by all gradio event handlers below.
quiz = BenchmarkQuiz()
10
def update_quiz_screen(question_data: QuestionData):
    """Build the gradio update dict that renders *question_data* on the quiz screen.

    Exactly one of the two answer widgets is shown, depending on whether the
    active benchmark is multiple-choice (radio) or free-text (textbox).
    """
    state = quiz.state
    benchmark_type = BENCHMARKS[state.benchmark_name]["type"]
    # Previously stored answer (None if the question was not answered yet).
    stored_answer = state.user_answers[state.current_question]
    return {
        quiz_screen: gr.update(visible=True),
        question_number: gr.update(value=question_data.question_num),
        question_text: gr.update(value=question_data.question),
        answer_input: gr.update(
            value=stored_answer,
            choices=question_data.options,
            visible=benchmark_type == "multiple_choice",
            label=question_data.instruction,
        ),
        free_text_input: gr.update(
            value=stored_answer,
            visible=benchmark_type == "free_text",
            label=question_data.instruction,
        ),
        next_button: gr.update(value=question_data.next_button_text),
        previous_button: gr.update(visible=question_data.previous_button_visibility),
    }
+
33
+
34
def update_score_screen(plot):
    """Show the results screen with the rendered score figure."""
    return {
        score_screen: gr.update(visible=True),
        score_plot: gr.update(value=plot),
    }
+
40
+
41
def start_quiz_handler(benchmark_name):
    """Initialize quiz state for *benchmark_name* and show the first question."""
    quiz.start_quiz(benchmark_name)
    first_question = quiz.update_question()
    updates = {
        start_screen: gr.update(visible=False),
        score_screen: gr.update(visible=False),
    }
    updates.update(update_quiz_screen(first_question))
    return updates
+
50
+
51
def next_question_handler(answer_input, free_text_input):
    """Record the current answer, then advance to the next question or finish.

    Only one of the two input widgets is visible at a time; pick the value
    matching the active benchmark's answer type.
    """
    is_multiple_choice = (
        BENCHMARKS[quiz.state.benchmark_name]["type"] == "multiple_choice"
    )
    answer = answer_input if is_multiple_choice else free_text_input
    result = quiz.next_question(answer)
    if not result["completed"]:
        return update_quiz_screen(result["question_data"])
    # Last question answered: hide the quiz and show the score plot.
    return {
        quiz_screen: gr.update(visible=False),
        **update_score_screen(result["plot"]),
    }
+
66
+
67
def previous_question_handler():
    """Step back one question and re-render the quiz screen."""
    return update_quiz_screen(quiz.previous_question())
+
71
+
72
def reset_quiz_handler():
    """Return to the benchmark-selection screen, restoring the next button."""
    return {
        start_screen: gr.update(visible=True),
        quiz_screen: gr.update(visible=False),
        score_screen: gr.update(visible=False),
        next_button: gr.update(visible=True),
    }
+
80
+
81
+
82
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Screen 1: benchmark selection — one button per registered benchmark.
    start_screen = gr.Column(visible=True)
    with start_screen:
        gr.Markdown("# Veldu mælipróf")
        benchmark_buttons = {
            name: gr.Button(info["name"]) for name, info in BENCHMARKS.items()
        }

    # Screen 2: the quiz. Exactly one of answer_input / free_text_input is
    # made visible by update_quiz_screen(), depending on the benchmark type.
    quiz_screen = gr.Column(visible=False)
    with quiz_screen:
        question_number = gr.Markdown()
        question_text = gr.Markdown()
        answer_input = gr.Radio(choices=[], visible=False)
        free_text_input = gr.Textbox(visible=False)
        with gr.Row():
            previous_button = gr.Button("Fyrri")
            next_button = gr.Button("Næsta")

    # Screen 3: results plot plus a restart button.
    score_screen = gr.Column(visible=False)
    with score_screen:
        # Plain string: the original used an f-string with no placeholders.
        gr.Markdown("## Niðurstöður")
        score_plot = gr.Plot()
        reset_btn = gr.Button("Byrja upp á nýtt")

    # Each start button carries its benchmark id via gr.State.
    for benchmark_name, button in benchmark_buttons.items():
        button.click(
            fn=start_quiz_handler,
            inputs=[gr.State(benchmark_name)],
            outputs=[
                start_screen,
                quiz_screen,
                score_screen,
                question_number,
                question_text,
                answer_input,
                free_text_input,
                next_button,
                previous_button,
            ],
        )

    next_button.click(
        fn=next_question_handler,
        inputs=[answer_input, free_text_input],
        outputs=[
            quiz_screen,
            score_screen,
            question_number,
            question_text,
            answer_input,
            free_text_input,
            next_button,
            previous_button,
            score_plot,
        ],
    )

    previous_button.click(
        fn=previous_question_handler,
        inputs=[],
        outputs=[
            quiz_screen,
            question_number,
            question_text,
            answer_input,
            free_text_input,
            next_button,
            previous_button,
        ],
    )

    reset_btn.click(
        fn=reset_quiz_handler,
        inputs=[],
        outputs=[start_screen, quiz_screen, score_screen, next_button],
    )

demo.launch()
quiz.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from hmac import new  # NOTE(review): unused — looks like an accidental auto-import; safe to delete
from datasets import load_dataset
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
import random
import matplotlib.pyplot as plt
from score import calculate_gpt4o_score, BENCHMARK_SCORES
10
# Registry of available benchmarks. Each entry maps an internal id to:
#   name        - display name shown on the start-screen button
#   path        - Hugging Face dataset path passed to load_dataset()
#   config_name - optional dataset config (only Belebele needs one)
#   split       - optional split override (defaults to "train")
#   type        - "multiple_choice" (radio buttons) or "free_text" (textbox)
BENCHMARKS = {
    "icelandic-winogrande": {
        "name": "Winogrande",
        "path": "mideind/icelandic-winogrande",
        "type": "multiple_choice",
    },
    "grammatical-error-detection": {
        "name": "Málfræðivillur",
        "path": "mideind/icelandic-sentences-gec",
        "type": "multiple_choice",
    },
    "icelandic-inflection-all": {
        "name": "Fallbeygingarpróf",
        "path": "mideind/icelandic-inflection-all-flat",
        "type": "free_text",
    },
    "icelandic-belebele": {
        "name": "Belebele",
        "path": "facebook/belebele",
        "config_name": "isl_Latn",
        "split": "test",
        "type": "multiple_choice",
    },
    "icelandic-arc-challenge": {
        "name": "ARC Challenge",
        "path": "mideind/icelandic-arc-challenge",
        "type": "multiple_choice",
    },
    "icelandic-wiki-qa": {
        "name": "Wikipediapróf",
        "path": "mideind/icelandic_wiki_qa",
        "type": "free_text",
    },
}
46
+
47
# Dataset-specific preprocessing: each function below converts one raw
# dataset row into the standardized quiz sample format
# (question / options / answer / instruction).
def winogrande_preprocessing(sample):
    """Standardize a Winogrande sample: fill-the-blank with two options."""
    blanked = sample["sentence"].replace("_", "________")
    question = (
        "Lestu eftirfarandi málsgrein:"
        "<p style='margin-left: 20px;'><i>{sentence}</i></p><br>"
        "Hvor valkostanna passar betur í eyðuna?"
    ).format(sentence=blanked)
    # The dataset encodes the answer as "1" or "2"; map it to the option text.
    correct = sample["option1"] if sample["answer"] == "1" else sample["option2"]
    return {
        "question": question,
        "options": (sample["option1"], sample["option2"]),
        "answer": correct,
        "instruction": "Valkostir",
    }
63
def icelandic_sentence_gec_preprocessing(sample):
    """Standardize a grammatical-error-detection sample (binary choice)."""
    verdict = "Engin villa" if sample["correct"] else "Villa"
    question = (
        "Inniheldur eftirfarandi málsgrein villu?"
        f"<p style='margin-left: 25px;'><i>{sample['sentence']}</i></p>"
    )
    return {
        "question": question,
        "options": ("Villa", "Engin villa"),
        "answer": verdict,
        "instruction": "Valkostir",
    }
73
+
74
def inflection_all_preprocessing(sample):
    """Standardize an inflection sample: ask for a specific case/number form."""
    # Abbreviated grammatical labels -> full Icelandic names.
    case_names = {
        "nf": "nefnifalli",
        "þf": "þolfalli",
        "þgf": "þágufalli",
        "ef": "eignarfalli",
    }
    number_names = {"et": "eintölu", "ft": "fleirtölu"}
    case = case_names[sample["case"]]
    number = number_names[sample["plurality"]]
    return {
        "question": f"Hvernig beygist <i>„{sample['noun_phrase']}“</i> í {case} {number}?",
        "answer": sample["inflection"],
        "instruction": "Skrifaðu réttu beyginguna.",
    }
90
+
91
def belebele_preprocessing(sample):
    """Standardize a Belebele reading-comprehension sample (4 options)."""
    options = [sample[f"mc_answer{i}"] for i in range(1, 5)]
    # correct_answer_num is 1-based in the dataset.
    correct = options[int(sample["correct_answer_num"]) - 1]
    question = (
        f'Lestu eftirfarandi texta:<p style="margin-left: 25px;"><i>{sample["flores_passage"]}</i></p>'
        f'\n\n{sample["question"]}'
    )
    return {
        "question": question,
        "options": options,
        "answer": correct,
        "instruction": "Veldu réttasta svarið.",
    }
107
+
108
def arc_challenge_preprocessing(sample):
    """Standardize an ARC Challenge sample: map answerKey label to its text."""
    choices = sample["choices"]
    answer_idx = choices["label"].index(sample["answerKey"])
    return {
        "question": sample["question"],
        "options": choices["text"],
        "answer": choices["text"][answer_idx],
        "instruction": "Veldu réttasta svarið.",
    }
117
+
118
def wikipedia_preprocessing(sample):
    """Standardize a Wikipedia QA sample (free-text answer, no options)."""
    return {
        "question": sample["query"],
        "answer": sample["answer"],
        "instruction": "Skrifaðu svarið þitt að neðan.",
    }
125
+
126
@dataclass
class QuizState:
    """Mutable state of one quiz run.

    Tracks the chosen benchmark, the drawn samples, the cursor position and
    the user's answers alongside the reference answers.
    """

    benchmark_name: str                  # key into BENCHMARKS
    samples: List[Dict[str, Any]]        # standardized samples for this run
    current_question: int                # 0-based cursor into samples
    user_answers: List[Optional[str]]    # None until a question is answered
    correct_answers: List[str]           # reference answer per sample
    quiz_completed: bool                 # set once the last answer is submitted
136
@dataclass
class QuestionData:
    """Everything the UI needs to render a single question."""

    question_num: str                   # markdown header, e.g. "### Spurning 1 af 5"
    question: str                       # question text (may contain HTML)
    options: Optional[List[str]]        # choices for multiple-choice, else None
    answer: Optional[str]               # previously stored user answer, if any
    next_button_text: str               # "Næsta" or "Klára" on the last question
    previous_button_visibility: bool    # hidden on the first question
    instruction: str = ""               # label shown on the answer widget
146
+
147
class BenchmarkQuiz:
    """Drives one quiz run: sample loading, navigation, scoring and plotting."""

    # Benchmark id -> function converting a raw dataset row into the
    # standardized sample format. Replaces the former if/elif chain.
    PREPROCESSORS = {
        "icelandic-winogrande": winogrande_preprocessing,
        "grammatical-error-detection": icelandic_sentence_gec_preprocessing,
        "icelandic-inflection-all": inflection_all_preprocessing,
        "icelandic-belebele": belebele_preprocessing,
        "icelandic-arc-challenge": arc_challenge_preprocessing,
        "icelandic-wiki-qa": wikipedia_preprocessing,
    }

    def __init__(self):
        # No quiz in progress until start_quiz() is called.
        self.state: Optional[QuizState] = None

    def start_quiz(self, benchmark_name: str, num_questions: int = 5) -> QuizState:
        """Draw `num_questions` random samples and reset all quiz state.

        The default of 5 preserves the original behavior; callers may now
        request a different quiz length.
        """
        samples = self.load_benchmark(benchmark_name, num_questions)
        correct_answers = [sample["answer"] for sample in samples]
        self.state = QuizState(
            benchmark_name=benchmark_name,
            samples=samples,
            current_question=0,
            user_answers=[None] * len(samples),
            correct_answers=correct_answers,
            quiz_completed=False,
        )
        return self.state

    def load_benchmark(
        self, benchmark_name: str, num_questions: int = 5
    ) -> List[Dict[str, Any]]:
        """Download the benchmark dataset and return `num_questions` randomly
        drawn, standardized samples."""
        info = BENCHMARKS[benchmark_name]
        dataset = load_dataset(
            info["path"],
            name=info.get("config_name"),
            split=info.get("split", "train"),
        )
        samples = random.sample(list(dataset), num_questions)
        preprocess = self.PREPROCESSORS.get(benchmark_name)
        if preprocess is not None:
            samples = [preprocess(sample) for sample in samples]
        return samples

    def update_question(self) -> QuestionData:
        """
        Update the question data based on the current state.
        Is called when the user navigates to a new question.
        """
        current_question = self.state.current_question
        sample = self.state.samples[current_question]

        question_num = (
            f"### Spurning {current_question + 1} af {len(self.state.samples)}"
        )
        # The last question swaps the "next" label for "finish".
        next_button_text = (
            "Klára" if current_question == len(self.state.samples) - 1 else "Næsta"
        )
        return QuestionData(
            question_num=question_num,
            question=sample["question"],
            options=sample.get("options"),
            answer=self.state.user_answers[current_question],
            next_button_text=next_button_text,
            previous_button_visibility=current_question > 0,
            instruction=sample.get("instruction", ""),
        )

    def next_question(self, answer: str) -> Dict[str, Any]:
        """
        Update the state with the user's answer to the current question.
        If the quiz is not completed, return the next question data.
        If the quiz is completed, return the score plot.
        Is called when the user submits an answer.
        """
        self.state.user_answers[self.state.current_question] = answer
        if self.state.current_question < len(self.state.samples) - 1:
            self.state.current_question += 1
            return {"completed": False, "question_data": self.update_question()}
        else:
            self.state.quiz_completed = True
            user_score = self.calculate_score()
            plot = self.plot_score(user_score)
            return {"completed": True, "plot": plot}

    def previous_question(self) -> QuestionData:
        """Move the cursor back one question (no-op on the first) and re-render."""
        if self.state.current_question > 0:
            self.state.current_question -= 1
        return self.update_question()

    def calculate_score(self) -> float:
        """Return the user's score as a fraction in [0, 1]."""
        if self.state.benchmark_name == "icelandic-wiki-qa":
            # Free-text Wiki QA answers are graded by GPT-4o as a judge.
            queries = [sample["question"] for sample in self.state.samples]
            return calculate_gpt4o_score(
                queries, self.state.user_answers, self.state.correct_answers
            )

        # Exact-match benchmarks: plain accuracy over all questions.
        score = sum(
            user_answer == correct_answer
            for user_answer, correct_answer in zip(
                self.state.user_answers, self.state.correct_answers
            )
        )
        return score / len(self.state.correct_answers)

    def plot_score(self, user_score: float):
        """Horizontal bar chart comparing the user's percentage score with the
        published model scores for this benchmark.

        Requires BENCHMARK_SCORES to contain a key equal to the benchmark id.
        """
        scores = {**BENCHMARK_SCORES[self.state.benchmark_name], "Þú": 100 * user_score}
        # Sort ascending so the best score ends up as the top bar.
        scores = dict(sorted(scores.items(), key=lambda item: item[1]))

        # Highlight the user's bar in green, the models in blue.
        colors = {name: "tab:blue" for name in scores.keys()}
        colors["Þú"] = "tab:green"

        fig, ax = plt.subplots(figsize=(10, 6), dpi=250)
        ax.spines[["left", "top", "right"]].set_visible(False)

        ax.barh(
            scores.keys(),
            scores.values(),
            height=0.6,
            color=[colors[name] for name in scores.keys()],
        )
        ax.set_axisbelow(True)
        ax.xaxis.grid(True, linestyle="--", alpha=0.6)
        ax.set_title(
            f"{BENCHMARKS[self.state.benchmark_name]['name']}: Svona stóðstu þig miðað við mállíkönin",
            pad=20,
        )
        ax.set_xlabel("Stig (%)")
        ax.set_xlim(0, 100)
        plt.tight_layout()
        return fig
score.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from openai import OpenAI
4
+ import re
5
+
6
# Published model scores (percent) per benchmark. Keys MUST match the
# benchmark ids used in quiz.py's BENCHMARKS, because plot_score() indexes
# this dict with state.benchmark_name. The Belebele entry previously used
# the key "belebele", which raised KeyError for "icelandic-belebele".
BENCHMARK_SCORES = {
    "icelandic-winogrande": {
        "Claude 3.5 Sonnet": 90.4,
        "GPT-4o": 85.4,
        "GPT-4-turbo": 85.8,
        "Hermes 3 Llama 3.1 405B fp8": 70.6,
        "Claude 2.1": 55.1,
        "GPT-3.5-turbo": 52.0,
    },
    "grammatical-error-detection": {
        "Claude 3.5 Sonnet": 70.0,
        "GPT-4o": 68.0,
        "GPT-4-turbo": 60.5,
        "Hermes 3 Llama 3.1 405B fp8": 53.5,
        "Claude 2.1": 52.5,
        "GPT-3.5-turbo": 52.0,
    },
    "icelandic-inflection-all": {
        "Claude 3.5 Sonnet": 89.2,
        "GPT-4o": 87.8,
        "GPT-4-turbo": 76.6,
        "Hermes 3 Llama 3.1 405B fp8": 61.8,
        "Claude 2.1": 55.2,
        "GPT-3.5-turbo": 39.1,
    },
    "icelandic-belebele": {
        "Claude 3.5 Sonnet": 92.0,
        "GPT-4o": 90.4,
        "GPT-4-turbo": 89.3,
        "Hermes 3 Llama 3.1 405B fp8": 86.1,
        "Claude 2.1": 42.1,
        "GPT-3.5-turbo": 59.2,
    },
    "icelandic-arc-challenge": {
        "Claude 3.5 Sonnet": 89.6,
        "GPT-4o": 90.4,
        "GPT-4-turbo": 88.7,
        "Hermes 3 Llama 3.1 405B fp8": 72.0,
        "Claude 2.1": 59.9,
        "GPT-3.5-turbo": 49.5,
    },
    "icelandic-wiki-qa": {
        "Claude 3.5 Sonnet": 44.7,
        "GPT-4o": 38.0,
        "GPT-4-turbo": 31.0,
        "Hermes 3 Llama 3.1 405B fp8": 33.8,
        "Claude 2.1": 21.1,
        "GPT-3.5-turbo": 15.0,
    },
}
58
+
59
# Lazily constructed OpenAI client. Created on first use so that merely
# importing this module does not require OPENAI_API_KEY to be set — the
# original constructed the client at import time, which raises when the key
# is missing and crashed the whole app even for benchmarks that never call
# the judge model.
_client = None


def _get_client():
    """Return the shared OpenAI client, creating it on first call."""
    global _client
    if _client is None:
        _client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    return _client


def calculate_gpt4o_score(queries, user_answers, correct_answers):
    """
    Calculate the score for the Icelandic Wiki QA benchmark.

    Each (query, user answer) pair is graded by GPT-4o against the reference
    answer as "poor" (0), "fair" (0.5) or "excellent" (1). Returns the mean
    grade as a fraction in [0, 1]; 0.0 for empty input.
    """
    prompt = "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness. You will be given the question which was asked, a correct reference answer, and the assistant's answer. Begin your evaluation by briefly comparing the assistant's answer with the correct answer. Identify any mistakes. Be as objective as possible. Additional information beyond the reference answer's content should not be considered. If the assistant's answer is not in Icelandic but the reference answer is, you should rate the answer poorly. After providing your short explanation, you must rate the assistant's answer using the following scale: [[poor]]: Incorrect, off-topic or in a different language; [[fair]]: Partially aligns with the reference answer with some inaccuracies or irrelevant information; [[excellent]]: Accurate and relevant, matching the reference answer in content and language.\nProvide your rating strictly in this format: \"Rating: [[category]]\", for example: \"Rating: [[fair]]\".\n\n[Question]\n{query}\n\n[Start of Correct Answer]\n{answer}\n[End of Correct Answer]\n\n[Start of Assistant's Answer]\n{response}\n[End of Assistant's Answer]"
    score_filter = re.compile(r"Rating: \[\[(.*?)\]\]")
    points = {"excellent": 1.0, "fair": 0.5, "poor": 0.0}

    client = _get_client()
    total = 0.0
    count = 0
    for query, u_answer, c_answer in zip(queries, user_answers, correct_answers):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt.format(
                        query=query, answer=c_answer, response=u_answer
                    ),
                }
            ],
            model="gpt-4o",
            max_completion_tokens=200,
        )
        content = chat_completion.choices[0].message.content or ""
        match = score_filter.search(content)
        # Unparseable judge output counts as "poor" instead of raising
        # AttributeError on `.group()` of None, as the original did.
        rating = match.group(1).lower() if match else "poor"
        total += points.get(rating, 0.0)
        count += 1

    # Guard against empty input (the original divided by zero here).
    return total / count if count else 0.0