|
|
|
import json |
|
import os |
|
import time |
|
from typing import Dict, Iterable, List, Optional, Tuple, Union |
|
|
|
import litellm |
|
from datasets import load_dataset |
|
from litellm import completion |
|
|
|
# Silently drop request params that the selected model/provider does not
# support, instead of raising (litellm global setting).
litellm.drop_params = True
|
|
|
|
|
# SECURITY: a live OpenAI API key was previously hardcoded here and committed
# to source control. It has been removed — rotate/revoke that key immediately.
# The key must now be supplied via the OPENAI_API_KEY environment variable
# (litellm reads it from the environment automatically); fail fast with a
# clear message if it is missing so the error surfaces before any API call.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY environment variable is not set; "
        "export it before running this module."
    )
|
|
|
# Default system prompt for tossup mode: instructs the model to reply with a
# strict JSON object ("answer" + "confidence") so the response can be
# machine-parsed by _extract_confidence_and_answer.
DEFAULT_SYS_PROMPT = """
You are a Quizbowl expert. You will be given a question that's progressively revealed.
Your goal is to identify the answer as quickly as possible with high confidence.
Respond with a JSON object with two fields:
1. "answer": Your best guess for the answer
2. "confidence": Your confidence in your answer from 0.0 to 1.0

DO NOT include any explanation. ONLY return the JSON object.
"""
|
|
|
|
|
class QuizbowlAgent:
    """
    An agent for playing Quizbowl with two modes:

    1. Tossup mode: Fast and direct with confidence calibration for buzzing
    2. Bonus round mode: Provides guess, rationale, and confidence
    """

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        buzz_threshold: float = 0.85,
        temperature: float = 0.2,
        system_prompt: Optional[str] = None,
    ):
        """
        Initialize the QuizbowlAgent.

        Args:
            model: The LLM model to use for answering
            buzz_threshold: Confidence threshold for buzzing in tossup mode (0-1)
            temperature: Temperature for model sampling
            system_prompt: System prompt used for tossup requests; when None
                (the default), DEFAULT_SYS_PROMPT is used. A None sentinel is
                backward compatible with the previous DEFAULT_SYS_PROMPT default.
        """
        self.model = model
        self.buzz_threshold = buzz_threshold
        self.temperature = temperature
        self.system_prompt = DEFAULT_SYS_PROMPT if system_prompt is None else system_prompt

    def _process_question_runs(self, question_runs: List[str]) -> List[str]:
        """Process question runs to extract increasing amounts of text.

        Currently a pass-through; kept as an override hook for subclasses.
        """
        return question_runs

    def _get_agent_response(self, prompt: str, system_prompt: str) -> Tuple[object, float]:
        """Send a single chat request to the LLM.

        Args:
            prompt: The user-turn content.
            system_prompt: The system-turn content.

        Returns:
            A (response, response_time) tuple: the raw litellm response object
            and the wall-clock seconds the call took. (The original annotation
            claimed ``Dict`` but the method has always returned this tuple.)
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]

        start_time = time.time()
        response = completion(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            # Answers are short JSON objects; cap tokens to keep calls cheap.
            max_tokens=150,
        )
        response_time = time.time() - start_time

        return response, response_time

    def _extract_confidence_and_answer(self, content: str) -> Tuple[str, float]:
        """Extract the answer and confidence score from the model response.

        Tries strict JSON first. On failure, falls back to line-based parsing:
        the first line is taken as the answer and any "confidence:" line
        supplies the score (defaulting to 0.5).
        """
        try:
            data = json.loads(content)
            # A JSON array/scalar would make .get() raise AttributeError;
            # route it to the fallback parser instead.
            if not isinstance(data, dict):
                raise ValueError("expected a JSON object")
            answer = data.get("answer", "")
            confidence = float(data.get("confidence", 0.0))
            return answer, confidence
        except (json.JSONDecodeError, ValueError):
            lines = content.strip().split("\n")
            answer = lines[0] if lines else ""
            confidence = 0.5

            for line in lines:
                if "confidence:" in line.lower():
                    try:
                        confidence = float(line.lower().split("confidence:")[1].strip())
                    except (ValueError, IndexError):
                        # Keep the 0.5 default when the value is unparseable.
                        pass

            return answer, confidence

    def tossup_mode(self, question_runs: List[str]) -> Iterable[Dict]:
        """
        Process a tossup question and decide when to buzz based on confidence.

        Args:
            question_runs: Progressive reveals of the question text

        Yields:
            Dict with answer, confidence, and whether to buzz. The generator
            stops after the first fragment that clears the buzz threshold.
        """
        for i, question_text in enumerate(question_runs):
            prompt = f"Question: {question_text}\n\nProvide your answer and confidence level:"

            # Use the configured system prompt; the constructor argument was
            # previously ignored here in favor of DEFAULT_SYS_PROMPT.
            response, response_time = self._get_agent_response(prompt, self.system_prompt)
            content = response.choices[0].message.content

            answer, confidence = self._extract_confidence_and_answer(content)

            result = {
                "answer": answer,
                "confidence": confidence,
                "buzz": confidence >= self.buzz_threshold,
                "question_fragment": question_text,
                "position": i + 1,
                "full_response": content,
                "response_time": response_time,
            }

            yield result

            # Confident enough to buzz — stop consuming further reveals.
            if confidence >= self.buzz_threshold:
                return

    def tossup_mode_top5(self, question_runs: List[str]) -> Iterable[Dict]:
        """
        Process a tossup question and provide the top 5 guesses with confidence levels.

        Args:
            question_runs: Progressive reveals of the question text

        Yields:
            Dict with top 5 answers, their confidences, and whether to buzz.
            Stops after the first fragment where any guess clears the threshold.
        """
        for i, question_text in enumerate(question_runs):
            prompt = f"Question: {question_text}\n\nProvide your top 5 answers and confidence levels."

            response, response_time = self._get_agent_response(prompt, self.system_prompt)
            content = response.choices[0].message.content

            try:
                data = json.loads(content)
                # Guard against a non-object JSON payload (e.g. a bare list).
                guesses = data.get("guesses", []) if isinstance(data, dict) else []
            except (json.JSONDecodeError, ValueError):
                guesses = []

            result = {
                "guesses": guesses,
                # .get() guards against a guess missing its "confidence" key,
                # which previously raised KeyError on malformed model output.
                "buzz": any(
                    guess.get("confidence", 0.0) >= self.buzz_threshold for guess in guesses
                ),
                "question_fragment": question_text,
                "position": i + 1,
                "full_response": content,
                "response_time": response_time,
            }

            yield result

            if result["buzz"]:
                return

    def bonus_round_mode(self, question: str) -> Dict:
        """
        Process a bonus round question with detailed analysis.

        Args:
            question: The bonus question text

        Returns:
            Dict with answer, rationale, and confidence
        """
        system_prompt = """
You are a Quizbowl expert answering a bonus question. Provide:
1. Your direct answer
2. A very brief and crisp one line rationale for your answer (key clues that led to it)
3. Your confidence level (0.0-1.0)

Respond with a JSON object with these three fields:
{
    "answer": "Your answer here",
    "rationale": "Your reasoning here",
    "confidence": 0.XX
}
"""

        prompt = f"Bonus Question: {question}\n\nProvide your answer, rationale, and confidence:"

        # BUG FIX: _get_agent_response returns a (response, elapsed) tuple;
        # the original code assigned the tuple itself to `response`, so the
        # subsequent `.choices` access raised AttributeError on every call.
        response, _ = self._get_agent_response(prompt, system_prompt)
        content = response.choices[0].message.content

        try:
            result = json.loads(content)
            # Treat non-objects and partial objects alike: fall back to the
            # line-based parser below.
            if not isinstance(result, dict) or not all(
                k in result for k in ["answer", "rationale", "confidence"]
            ):
                raise ValueError("Missing fields in response")
        except (json.JSONDecodeError, ValueError):
            lines = content.strip().split("\n")
            result = {"answer": "", "rationale": "", "confidence": 0.5}

            for line in lines:
                if line.lower().startswith("answer:"):
                    result["answer"] = line[7:].strip()
                elif line.lower().startswith("rationale:"):
                    result["rationale"] = line[10:].strip()
                elif line.lower().startswith("confidence:"):
                    try:
                        result["confidence"] = float(line[11:].strip())
                    except ValueError:
                        # Keep the 0.5 default on an unparseable number.
                        pass

        return result
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Demo driver: pulls one question from the HF leaderboard dataset and runs
    # all three agent modes on it. Requires network access and a valid
    # OPENAI_API_KEY in the environment.

    ds_name = "umdclip/leaderboard_co_set"
    ds = load_dataset(ds_name, split="train")

    agent = QuizbowlAgent(model="gpt-4-turbo", buzz_threshold=0.85)

    print("\n=== TOSSUP MODE EXAMPLE ===")
    sample_question = ds[0]
    # Last run is the fully-revealed question; gold_label is the reference answer.
    print(sample_question["question_runs"][-1])
    print(sample_question["gold_label"])
    question_runs = sample_question["question_runs"]

    # Single-answer tossup: the generator stops after the first buzz.
    results = agent.tossup_mode(question_runs)
    for result in results:
        print(f"Guess at position {result['position']}: {result['answer']}")
        print(f"Confidence: {result['confidence']}")
        if result["buzz"]:
            print("Buzzed!\n")

    # Top-5 variant: each yielded result carries a list of guess dicts.
    results = agent.tossup_mode_top5(question_runs)
    for result in results:
        guesses = [f"{guess['answer']} ({guess['confidence']})" for guess in result["guesses"]]
        print(f"Guesses at position {result['position']}: {', '.join(guesses)}")
        if result["buzz"]:
            print("Buzzed!")

    print("\n=== BONUS ROUND MODE EXAMPLE ===")
    # Reuse the fully-revealed tossup text as a stand-in bonus question.
    bonus_question = sample_question["question_runs"][-1]

    bonus_result = agent.bonus_round_mode(bonus_question)
    print(f"Answer: {bonus_result['answer']}")
    print(f"Rationale: {bonus_result['rationale']}")
    print(f"Confidence: {bonus_result['confidence']}")
|
|
|
|
|
|