# quizbowl-submission / src/workflows/quizbowl_agent.py
# Author: Maharshi Gor — "First Working commit" (193db9d)
# %%
import json
import os
import time
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import litellm
from datasets import load_dataset
from litellm import completion
litellm.drop_params = True

# SECURITY: the OpenAI API key must come from the environment
# (e.g. `export OPENAI_API_KEY=...`), never be committed to source control.
# NOTE(review): a previous revision embedded a live key on these lines —
# that key is compromised and must be revoked on the OpenAI dashboard.
# Fail fast with a clear message so LLM calls don't die with an opaque
# auth error deep inside litellm.
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError(
        "OPENAI_API_KEY environment variable is not set; "
        "export it before importing this module."
    )
# Default system prompt for tossup questions. It forces a strict JSON-only
# reply ({"answer": ..., "confidence": ...}) so the agent can parse the
# answer and confidence programmatically; free-form text would break parsing.
DEFAULT_SYS_PROMPT = """
You are a Quizbowl expert. You will be given a question that's progressively revealed.
Your goal is to identify the answer as quickly as possible with high confidence.
Respond with a JSON object with two fields:
1. "answer": Your best guess for the answer
2. "confidence": Your confidence in your answer from 0.0 to 1.0
DO NOT include any explanation. ONLY return the JSON object.
"""
class QuizbowlAgent:
"""
An agent for playing Quizbowl with two modes:
1. Tossup mode: Fast and direct with confidence calibration for buzzing
2. Bonus round mode: Provides guess, rationale, and confidence
"""
def __init__(
self,
model: str = "gpt-4o-mini",
buzz_threshold: float = 0.85,
temperature: float = 0.2,
system_prompt: str = DEFAULT_SYS_PROMPT,
):
"""
Initialize the QuizbowlAgent.
Args:
model: The LLM model to use for answering
buzz_threshold: Confidence threshold for buzzing in tossup mode (0-1)
temperature: Temperature for model sampling
"""
self.model = model
self.buzz_threshold = buzz_threshold
self.temperature = temperature
self.system_prompt = system_prompt
def _process_question_runs(self, question_runs: List[str]) -> List[str]:
"""Process question runs to extract increasing amounts of text."""
# For simpler testing, just return the runs as they are in the dataset
return question_runs
def _get_agent_response(self, prompt: str, system_prompt: str) -> Dict:
"""Get response from the LLM model."""
messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
start_time = time.time()
response = completion(
model=self.model,
messages=messages,
temperature=self.temperature,
max_tokens=150, # Limit token usage for faster responses
)
response_time = time.time() - start_time
return response, response_time
def _extract_confidence_and_answer(self, content: str) -> Tuple[str, float]:
"""Extract the answer and confidence score from the model response."""
try:
# Try to parse JSON from the response
data = json.loads(content)
answer = data.get("answer", "")
confidence = float(data.get("confidence", 0.0))
return answer, confidence
except (json.JSONDecodeError, ValueError):
# Fallback if parsing fails
lines = content.strip().split("\n")
answer = lines[0] if lines else ""
confidence = 0.5 # Default confidence
# Try to extract confidence from text
for line in lines:
if "confidence:" in line.lower():
try:
confidence = float(line.lower().split("confidence:")[1].strip())
except (ValueError, IndexError):
pass
return answer, confidence
def tossup_mode(self, question_runs: List[str]) -> Iterable[Dict]:
"""
Process a tossup question and decide when to buzz based on confidence.
Args:
question_runs: Progressive reveals of the question text
Yields:
Dict with answer, confidence, and whether to buzz
"""
for i, question_text in enumerate(question_runs):
prompt = f"Question: {question_text}\n\nProvide your answer and confidence level:"
response, response_time = self._get_agent_response(prompt, DEFAULT_SYS_PROMPT)
content = response.choices[0].message.content
answer, confidence = self._extract_confidence_and_answer(content)
result = {
"answer": answer,
"confidence": confidence,
"buzz": confidence >= self.buzz_threshold,
"question_fragment": question_text,
"position": i + 1,
"full_response": content,
"response_time": response_time,
}
yield result
# If we've reached the confidence threshold, buzz and stop
if confidence >= self.buzz_threshold:
return
def tossup_mode_top5(self, question_runs: List[str]) -> Iterable[Dict]:
"""
Process a tossup question and provide the top 5 guesses with confidence levels.
Args:
question_runs: Progressive reveals of the question text
Returns:
Dict with top 5 answers, their confidences, and whether to buzz
"""
for i, question_text in enumerate(question_runs):
prompt = f"Question: {question_text}\n\nProvide your top 5 answers and confidence levels."
response, response_time = self._get_agent_response(prompt, self.system_prompt)
content = response.choices[0].message.content
try:
# Try to parse JSON from the response
data = json.loads(content)
guesses = data.get("guesses", [])
except (json.JSONDecodeError, ValueError):
# Fallback if parsing fails
guesses = []
result = {
"guesses": guesses,
"buzz": any(guess["confidence"] >= self.buzz_threshold for guess in guesses),
"question_fragment": question_text,
"position": i + 1,
"full_response": content,
"response_time": response_time,
}
yield result
# If any guess reaches the confidence threshold, buzz and stop
if result["buzz"]:
return
def bonus_round_mode(self, question: str) -> Dict:
"""
Process a bonus round question with detailed analysis.
Args:
question: The bonus question text
Returns:
Dict with answer, rationale, and confidence
"""
system_prompt = """
You are a Quizbowl expert answering a bonus question. Provide:
1. Your direct answer
2. A very brief and crisp one line rationale for your answer (key clues that led to it)
3. Your confidence level (0.0-1.0)
Respond with a JSON object with these three fields:
{
"answer": "Your answer here",
"rationale": "Your reasoning here",
"confidence": 0.XX
}
"""
prompt = f"Bonus Question: {question}\n\nProvide your answer, rationale, and confidence:"
response = self._get_agent_response(prompt, system_prompt)
content = response.choices[0].message.content
try:
# Try to parse JSON
result = json.loads(content)
# Ensure all fields are present
if not all(k in result for k in ["answer", "rationale", "confidence"]):
raise ValueError("Missing fields in response")
except (json.JSONDecodeError, ValueError):
# If parsing fails, extract manually
lines = content.strip().split("\n")
result = {"answer": "", "rationale": "", "confidence": 0.5}
for line in lines:
if line.lower().startswith("answer:"):
result["answer"] = line[7:].strip()
elif line.lower().startswith("rationale:"):
result["rationale"] = line[10:].strip()
elif line.lower().startswith("confidence:"):
try:
result["confidence"] = float(line[11:].strip())
except ValueError:
pass
return result
# %%
# Example usage
if __name__ == "__main__":
    # Load the Quizbowl dataset used for the demo.
    ds_name = "umdclip/leaderboard_co_set"
    ds = load_dataset(ds_name, split="train")

    # Create the agent with a conservative buzz threshold.
    agent = QuizbowlAgent(model="gpt-4-turbo", buzz_threshold=0.85)

    # Example for tossup mode: stream guesses until the agent buzzes.
    print("\n=== TOSSUP MODE EXAMPLE ===")
    sample_question = ds[0]
    print(sample_question["question_runs"][-1])
    print(sample_question["gold_label"])
    question_runs = sample_question["question_runs"]

    for result in agent.tossup_mode(question_runs):
        print(f"Guess at position {result['position']}: {result['answer']}")
        print(f"Confidence: {result['confidence']}")
        if result["buzz"]:
            print("Buzzed!\n")

    # Same question, but asking for the top-5 guesses each step.
    for result in agent.tossup_mode_top5(question_runs):
        formatted = [f"{g['answer']} ({g['confidence']})" for g in result["guesses"]]
        print(f"Guesses at position {result['position']}: {', '.join(formatted)}")
        if result["buzz"]:
            print("Buzzed!")

    # Example for bonus round mode: full question text in one shot.
    print("\n=== BONUS ROUND MODE EXAMPLE ===")
    bonus_question = sample_question["question_runs"][-1]
    bonus_result = agent.bonus_round_mode(bonus_question)
    print(f"Answer: {bonus_result['answer']}")
    print(f"Rationale: {bonus_result['rationale']}")
    print(f"Confidence: {bonus_result['confidence']}")
# %%