# %%
import json
import os
import time
from typing import Dict, Iterable, List, Optional, Tuple, Union

import litellm
from datasets import load_dataset
from litellm import completion

# Drop provider-unsupported params instead of erroring.
litellm.drop_params = True

# SECURITY: never commit API keys to source control. A hardcoded OpenAI key
# previously lived here — it has been removed and should be revoked/rotated.
# Set the key in your shell instead:  export OPENAI_API_KEY=sk-...
if "OPENAI_API_KEY" not in os.environ:
    # NOTE(review): fail loudly at import rather than with an opaque auth
    # error deep inside litellm at call time.
    raise RuntimeError("OPENAI_API_KEY environment variable is not set.")

DEFAULT_SYS_PROMPT = """ You are a Quizbowl expert. You will be given a question that's progressively revealed. Your goal is to identify the answer as quickly as possible with high confidence. Respond with a JSON object with two fields: 1. "answer": Your best guess for the answer 2. "confidence": Your confidence in your answer from 0.0 to 1.0 DO NOT include any explanation. ONLY return the JSON object. """


class QuizbowlAgent:
    """
    An agent for playing Quizbowl with two modes:

    1. Tossup mode: fast and direct with confidence calibration for buzzing.
    2. Bonus round mode: provides guess, rationale, and confidence.
    """

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        buzz_threshold: float = 0.85,
        temperature: float = 0.2,
        system_prompt: str = DEFAULT_SYS_PROMPT,
    ):
        """
        Initialize the QuizbowlAgent.

        Args:
            model: The LLM model to use for answering.
            buzz_threshold: Confidence threshold for buzzing in tossup mode (0-1).
            temperature: Temperature for model sampling.
            system_prompt: System prompt sent with every tossup request.
        """
        self.model = model
        self.buzz_threshold = buzz_threshold
        self.temperature = temperature
        self.system_prompt = system_prompt

    def _process_question_runs(self, question_runs: List[str]) -> List[str]:
        """Process question runs to extract increasing amounts of text.

        Currently a passthrough: the dataset already provides progressive
        reveals, so the runs are returned unchanged.
        """
        return question_runs

    def _get_agent_response(self, prompt: str, system_prompt: str) -> Tuple[object, float]:
        """Call the LLM and return ``(response, elapsed_seconds)``.

        Note: returns a 2-tuple, not the raw response — callers must unpack.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]

        start_time = time.time()
        response = completion(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            max_tokens=150,  # Limit token usage for faster responses
        )
        response_time = time.time() - start_time

        return response, response_time

    def _extract_confidence_and_answer(self, content: str) -> Tuple[str, float]:
        """Extract the answer and confidence score from the model response.

        Tries strict JSON first; on failure falls back to a line-based
        heuristic (first line is the answer, a ``confidence:`` line if any
        supplies the score, otherwise 0.5).
        """
        try:
            data = json.loads(content)
            answer = data.get("answer", "")
            confidence = float(data.get("confidence", 0.0))
            return answer, confidence
        except (json.JSONDecodeError, ValueError):
            # Fallback if parsing fails: treat the first line as the answer.
            lines = content.strip().split("\n")
            answer = lines[0] if lines else ""
            confidence = 0.5  # Default confidence

            # Try to extract a confidence value from free-form text.
            for line in lines:
                if "confidence:" in line.lower():
                    try:
                        confidence = float(line.lower().split("confidence:")[1].strip())
                    except (ValueError, IndexError):
                        pass

            return answer, confidence

    def tossup_mode(self, question_runs: List[str]) -> Iterable[Dict]:
        """
        Process a tossup question and decide when to buzz based on confidence.

        Args:
            question_runs: Progressive reveals of the question text.

        Yields:
            Dict with answer, confidence, and whether to buzz. Stops yielding
            once a guess reaches ``buzz_threshold``.
        """
        for i, question_text in enumerate(question_runs):
            prompt = f"Question: {question_text}\n\nProvide your answer and confidence level:"

            # Use the configured system prompt (was previously hardcoded to
            # DEFAULT_SYS_PROMPT, silently ignoring a custom system_prompt).
            response, response_time = self._get_agent_response(prompt, self.system_prompt)
            content = response.choices[0].message.content
            answer, confidence = self._extract_confidence_and_answer(content)

            result = {
                "answer": answer,
                "confidence": confidence,
                "buzz": confidence >= self.buzz_threshold,
                "question_fragment": question_text,
                "position": i + 1,
                "full_response": content,
                "response_time": response_time,
            }
            yield result

            # If we've reached the confidence threshold, buzz and stop.
            if confidence >= self.buzz_threshold:
                return

    def tossup_mode_top5(self, question_runs: List[str]) -> Iterable[Dict]:
        """
        Process a tossup question and provide the top 5 guesses with confidence levels.

        Args:
            question_runs: Progressive reveals of the question text.

        Yields:
            Dict with top 5 answers, their confidences, and whether to buzz.
            Stops yielding once any guess reaches ``buzz_threshold``.
        """
        for i, question_text in enumerate(question_runs):
            prompt = f"Question: {question_text}\n\nProvide your top 5 answers and confidence levels."

            response, response_time = self._get_agent_response(prompt, self.system_prompt)
            content = response.choices[0].message.content

            try:
                data = json.loads(content)
                guesses = data.get("guesses", [])
            except (json.JSONDecodeError, ValueError):
                # Fallback if parsing fails: no usable guesses this round.
                guesses = []

            result = {
                "guesses": guesses,
                # .get() guards against model JSON missing the key entirely.
                "buzz": any(guess.get("confidence", 0.0) >= self.buzz_threshold for guess in guesses),
                "question_fragment": question_text,
                "position": i + 1,
                "full_response": content,
                "response_time": response_time,
            }
            yield result

            # If any guess reaches the confidence threshold, buzz and stop.
            if result["buzz"]:
                return

    def bonus_round_mode(self, question: str) -> Dict:
        """
        Process a bonus round question with detailed analysis.

        Args:
            question: The bonus question text.

        Returns:
            Dict with ``answer``, ``rationale``, and ``confidence`` keys
            (defaults are filled in if the model response cannot be parsed).
        """
        system_prompt = """ You are a Quizbowl expert answering a bonus question. Provide: 1. Your direct answer 2. A very brief and crisp one line rationale for your answer (key clues that led to it) 3. Your confidence level (0.0-1.0) Respond with a JSON object with these three fields: { "answer": "Your answer here", "rationale": "Your reasoning here", "confidence": 0.XX } """

        prompt = f"Bonus Question: {question}\n\nProvide your answer, rationale, and confidence:"

        # BUG FIX: _get_agent_response returns (response, elapsed); the old
        # code bound the tuple itself and then crashed on `.choices`.
        response, _response_time = self._get_agent_response(prompt, system_prompt)
        content = response.choices[0].message.content

        try:
            result = json.loads(content)
            # Ensure all fields are present before trusting the parse.
            if not all(k in result for k in ["answer", "rationale", "confidence"]):
                raise ValueError("Missing fields in response")
        except (json.JSONDecodeError, ValueError):
            # If parsing fails, extract the fields manually line by line.
            lines = content.strip().split("\n")
            result = {"answer": "", "rationale": "", "confidence": 0.5}
            for line in lines:
                if line.lower().startswith("answer:"):
                    result["answer"] = line[7:].strip()
                elif line.lower().startswith("rationale:"):
                    result["rationale"] = line[10:].strip()
                elif line.lower().startswith("confidence:"):
                    try:
                        result["confidence"] = float(line[11:].strip())
                    except ValueError:
                        pass

        return result


# %%
# Example usage
if __name__ == "__main__":
    # Load the Quizbowl dataset.
    ds_name = "umdclip/leaderboard_co_set"
    ds = load_dataset(ds_name, split="train")

    # Create the agent.
    agent = QuizbowlAgent(model="gpt-4-turbo", buzz_threshold=0.85)

    # Example for tossup mode.
    print("\n=== TOSSUP MODE EXAMPLE ===")
    sample_question = ds[0]
    print(sample_question["question_runs"][-1])
    print(sample_question["gold_label"])

    question_runs = sample_question["question_runs"]

    results = agent.tossup_mode(question_runs)
    for result in results:
        print(f"Guess at position {result['position']}: {result['answer']}")
        print(f"Confidence: {result['confidence']}")
        if result["buzz"]:
            print("Buzzed!\n")

    results = agent.tossup_mode_top5(question_runs)
    for result in results:
        guesses = [f"{guess['answer']} ({guess['confidence']})" for guess in result["guesses"]]
        print(f"Guesses at position {result['position']}: {', '.join(guesses)}")
        if result["buzz"]:
            print("Buzzed!")

    # Example for bonus round mode.
    print("\n=== BONUS ROUND MODE EXAMPLE ===")
    bonus_question = sample_question["question_runs"][-1]

    bonus_result = agent.bonus_round_mode(bonus_question)
    print(f"Answer: {bonus_result['answer']}")
    print(f"Rationale: {bonus_result['rationale']}")
    print(f"Confidence: {bonus_result['confidence']}")

# %%