import logging
import os
import re

from openai import OpenAI

logger = logging.getLogger(__name__)

# Model scores, keyed first by benchmark name and then by model name.
BENCHMARK_SCORES = {
    "icelandic-winogrande": {
        "Claude 3.5 Sonnet": 90.4,
        "GPT-4o": 85.4,
        "GPT-4-turbo": 85.8,
        "Hermes 3 Llama 3.1 405B fp8": 70.6,
        "Claude 2.1": 55.1,
        "GPT-3.5-turbo": 52.0,
    },
    "grammatical-error-detection": {
        "Claude 3.5 Sonnet": 70.0,
        "GPT-4o": 68.0,
        "GPT-4-turbo": 60.5,
        "Hermes 3 Llama 3.1 405B fp8": 53.5,
        "Claude 2.1": 52.5,
        "GPT-3.5-turbo": 52.0,
    },
    "icelandic-inflection-all": {
        "Claude 3.5 Sonnet": 89.2,
        "GPT-4o": 87.8,
        "GPT-4-turbo": 76.6,
        "Hermes 3 Llama 3.1 405B fp8": 61.8,
        "Claude 2.1": 55.2,
        "GPT-3.5-turbo": 39.1,
    },
    "icelandic-belebele": {
        "Claude 3.5 Sonnet": 92.0,
        "GPT-4o": 90.4,
        "GPT-4-turbo": 89.3,
        "Hermes 3 Llama 3.1 405B fp8": 86.1,
        "Claude 2.1": 42.1,
        "GPT-3.5-turbo": 59.2,
    },
    "icelandic-arc-challenge": {
        "Claude 3.5 Sonnet": 89.6,
        "GPT-4o": 90.4,
        "GPT-4-turbo": 88.7,
        "Hermes 3 Llama 3.1 405B fp8": 72.0,
        "Claude 2.1": 59.9,
        "GPT-3.5-turbo": 49.5,
    },
    "icelandic-wiki-qa": {
        "Claude 3.5 Sonnet": 44.7,
        "GPT-4o": 38.0,
        "GPT-4-turbo": 31.0,
        "Hermes 3 Llama 3.1 405B fp8": 33.8,
        "Claude 2.1": 21.1,
        "GPT-3.5-turbo": 15.0,
    },
}

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Prompt sent to the judge model; {query}, {answer} and {response} are filled
# in per item. Split over several literals purely for readability.
_JUDGE_PROMPT = (
    "[Instruction]\n"
    "Please act as an impartial judge and evaluate the quality of the "
    "response provided by an AI assistant to the user question displayed "
    "below. Your evaluation should consider correctness. You will be given "
    "the question which was asked, a correct reference answer, and the "
    "assistant's answer. Begin your evaluation by briefly comparing the "
    "assistant's answer with the correct answer. Identify any mistakes. Be "
    "as objective as possible. Additional information beyond the reference "
    "answer's content should not be considered. If the assistant's answer "
    "is not in Icelandic but the reference answer is, you should rate the "
    "answer poorly. After providing your short explanation, you must rate "
    "the assistant's answer using the following scale: [[poor]]: Incorrect, "
    "off-topic or in a different language; [[fair]]: Partially aligns with "
    "the reference answer with some inaccuracies or irrelevant information; "
    "[[excellent]]: Accurate and relevant, matching the reference answer in "
    "content and language.\n"
    'Provide your rating strictly in this format: "Rating: [[category]]", '
    'for example: "Rating: [[fair]]".\n\n'
    "[Question]\n{query}\n\n"
    "[Start of Correct Answer]\n{answer}\n[End of Correct Answer]\n\n"
    "[Start of Assistant's Answer]\n{response}\n[End of Assistant's Answer]"
)

# Captures the category from the "Rating: [[category]]" line the prompt asks for.
_RATING_PATTERN = re.compile(r"Rating: \[\[(.*?)\]\]")

# Points per rating; any other label (including "poor") is worth 0.
_RATING_POINTS = {"excellent": 1.0, "fair": 0.5}


def _rating_to_points(verdict: str) -> float:
    """Return the points for the first rating found in the judge's reply."""
    match = _RATING_PATTERN.search(verdict)
    if match is None:
        # The reply may be empty or cut off by the token limit before the
        # rating line. Score the item as 0 instead of aborting the whole run.
        logger.warning(
            "No rating found in judge response; scoring it as 0: %r", verdict
        )
        return 0.0
    return _RATING_POINTS.get(match.group(1).strip().lower(), 0.0)


def calculate_gpt4o_score(queries, user_answers, correct_answers) -> float:
    """Calculate the score for the Icelandic Wiki QA benchmark.

    Each (query, user answer, correct answer) triple is sent to the
    ``gpt-4o`` judge in its own chat-completion request. The judge's rating
    is converted to points: ``excellent`` = 1, ``fair`` = 0.5, anything
    else = 0. A reply without a parsable rating is logged and counts as 0.

    Args:
        queries: The questions that were asked.
        user_answers: The answers to be judged, in the same order.
        correct_answers: The reference answers, in the same order.

    Returns:
        The mean points per item, between 0 and 1. Returns 0.0 when there
        are no items to judge.

    Note:
        The inputs are combined with ``zip``, so iteration stops at the
        shortest one and surplus items are ignored.
    """
    points = []
    for query, u_answer, c_answer in zip(queries, user_answers, correct_answers):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": _JUDGE_PROMPT.format(
                        query=query, answer=c_answer, response=u_answer
                    ),
                }
            ],
            model="gpt-4o",
            max_completion_tokens=200,
        )
        # ``content`` can be None; treat that the same as an empty reply.
        verdict = chat_completion.choices[0].message.content or ""
        points.append(_rating_to_points(verdict))

    if not points:
        # Nothing was judged; avoid dividing by zero.
        return 0.0
    return sum(points) / len(points)