File size: 9,766 Bytes
193db9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
# %%
import json
import os
import time
from typing import Dict, Iterable, List, Optional, Tuple, Union

import litellm
from datasets import load_dataset
from litellm import completion

# Let litellm silently drop provider-unsupported params instead of erroring.
litellm.drop_params = True

# SECURITY: an OpenAI API key was previously hardcoded here and committed to
# source control — treat that key as compromised and rotate/revoke it.
# The key must now be supplied via the environment, e.g.:
#   export OPENAI_API_KEY="sk-..."
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your environment before running."
    )

# System prompt used for tossup questions: forces a bare JSON reply with an
# answer plus a calibrated confidence score the agent can threshold against.
DEFAULT_SYS_PROMPT = """
You are a Quizbowl expert. You will be given a question that's progressively revealed.
Your goal is to identify the answer as quickly as possible with high confidence.
Respond with a JSON object with two fields:
1. "answer": Your best guess for the answer
2. "confidence": Your confidence in your answer from 0.0 to 1.0

DO NOT include any explanation. ONLY return the JSON object.
"""


class QuizbowlAgent:
    """
    An agent for playing Quizbowl with two modes:
    1. Tossup mode: Fast and direct with confidence calibration for buzzing
    2. Bonus round mode: Provides guess, rationale, and confidence
    """

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        buzz_threshold: float = 0.85,
        temperature: float = 0.2,
        system_prompt: str = DEFAULT_SYS_PROMPT,
    ):
        """
        Initialize the QuizbowlAgent.

        Args:
            model: The LLM model to use for answering
            buzz_threshold: Confidence threshold for buzzing in tossup mode (0-1)
            temperature: Temperature for model sampling
            system_prompt: System prompt sent with tossup requests
        """
        self.model = model
        self.buzz_threshold = buzz_threshold
        self.temperature = temperature
        self.system_prompt = system_prompt

    def _process_question_runs(self, question_runs: List[str]) -> List[str]:
        """Process question runs to extract increasing amounts of text."""
        # For simpler testing, just return the runs as they are in the dataset
        return question_runs

    def _get_agent_response(self, prompt: str, system_prompt: str) -> Tuple[Dict, float]:
        """Call the LLM and return (response, elapsed_seconds).

        BUGFIX: the annotation previously claimed a bare ``Dict`` although a
        two-tuple has always been returned. ``response`` is litellm's
        ModelResponse; callers read ``response.choices[0].message.content``.
        """
        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]

        start_time = time.time()
        response = completion(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            max_tokens=150,  # Limit token usage for faster responses
        )
        response_time = time.time() - start_time

        return response, response_time

    def _extract_confidence_and_answer(self, content: str) -> Tuple[str, float]:
        """Extract the answer and confidence score from the model response.

        Tries strict JSON first; falls back to line-based scraping with a
        default confidence of 0.5 when the model did not return valid JSON.
        """
        try:
            # Try to parse JSON from the response
            data = json.loads(content)
            answer = data.get("answer", "")
            confidence = float(data.get("confidence", 0.0))
            return answer, confidence
        except (json.JSONDecodeError, ValueError):
            # Fallback if parsing fails: first line is the best-guess answer
            lines = content.strip().split("\n")
            answer = lines[0] if lines else ""
            confidence = 0.5  # Default confidence

            # Try to extract confidence from text
            for line in lines:
                if "confidence:" in line.lower():
                    try:
                        confidence = float(line.lower().split("confidence:")[1].strip())
                    except (ValueError, IndexError):
                        pass

            return answer, confidence

    def tossup_mode(self, question_runs: List[str]) -> Iterable[Dict]:
        """
        Process a tossup question and decide when to buzz based on confidence.

        Args:
            question_runs: Progressive reveals of the question text

        Yields:
            Dict with answer, confidence, and whether to buzz
        """

        for i, question_text in enumerate(question_runs):
            prompt = f"Question: {question_text}\n\nProvide your answer and confidence level:"

            # BUGFIX: previously hardcoded DEFAULT_SYS_PROMPT, silently
            # ignoring the system_prompt passed to the constructor
            # (tossup_mode_top5 already used self.system_prompt).
            response, response_time = self._get_agent_response(prompt, self.system_prompt)
            content = response.choices[0].message.content

            answer, confidence = self._extract_confidence_and_answer(content)

            result = {
                "answer": answer,
                "confidence": confidence,
                "buzz": confidence >= self.buzz_threshold,
                "question_fragment": question_text,
                "position": i + 1,
                "full_response": content,
                "response_time": response_time,
            }

            yield result

            # If we've reached the confidence threshold, buzz and stop
            if confidence >= self.buzz_threshold:
                return

    def tossup_mode_top5(self, question_runs: List[str]) -> Iterable[Dict]:
        """
        Process a tossup question and provide the top 5 guesses with confidence levels.

        Args:
            question_runs: Progressive reveals of the question text

        Yields:
            Dict with top 5 answers, their confidences, and whether to buzz
        """

        for i, question_text in enumerate(question_runs):
            prompt = f"Question: {question_text}\n\nProvide your top 5 answers and confidence levels."

            response, response_time = self._get_agent_response(prompt, self.system_prompt)
            content = response.choices[0].message.content

            try:
                # Try to parse JSON from the response
                data = json.loads(content)
                guesses = data.get("guesses", [])
            except (json.JSONDecodeError, ValueError):
                # Fallback if parsing fails
                guesses = []

            result = {
                "guesses": guesses,
                # .get with 0.0 default: a malformed guess (missing
                # "confidence") previously raised KeyError here.
                "buzz": any(guess.get("confidence", 0.0) >= self.buzz_threshold for guess in guesses),
                "question_fragment": question_text,
                "position": i + 1,
                "full_response": content,
                "response_time": response_time,
            }

            yield result

            # If any guess reaches the confidence threshold, buzz and stop
            if result["buzz"]:
                return

    def bonus_round_mode(self, question: str) -> Dict:
        """
        Process a bonus round question with detailed analysis.

        Args:
            question: The bonus question text

        Returns:
            Dict with answer, rationale, and confidence
        """
        system_prompt = """
        You are a Quizbowl expert answering a bonus question. Provide:
        1. Your direct answer
        2. A very brief and crisp one line rationale for your answer (key clues that led to it)
        3. Your confidence level (0.0-1.0)
        
        Respond with a JSON object with these three fields:
        {
            "answer": "Your answer here",
            "rationale": "Your reasoning here",
            "confidence": 0.XX
        }
        """

        prompt = f"Bonus Question: {question}\n\nProvide your answer, rationale, and confidence:"

        # BUGFIX: _get_agent_response returns (response, elapsed); the old
        # code bound the whole tuple to `response`, so `response.choices`
        # raised AttributeError on every bonus question.
        response, _response_time = self._get_agent_response(prompt, system_prompt)
        content = response.choices[0].message.content

        try:
            # Try to parse JSON
            result = json.loads(content)
            # Ensure all fields are present
            if not all(k in result for k in ["answer", "rationale", "confidence"]):
                raise ValueError("Missing fields in response")
        except (json.JSONDecodeError, ValueError):
            # If parsing fails, extract manually line-by-line
            lines = content.strip().split("\n")
            result = {"answer": "", "rationale": "", "confidence": 0.5}

            for line in lines:
                if line.lower().startswith("answer:"):
                    result["answer"] = line[7:].strip()
                elif line.lower().startswith("rationale:"):
                    result["rationale"] = line[10:].strip()
                elif line.lower().startswith("confidence:"):
                    try:
                        result["confidence"] = float(line[11:].strip())
                    except ValueError:
                        pass

        return result


# %%
# Example usage
if __name__ == "__main__":
    # Pull the shared Quizbowl question set from the Hugging Face hub.
    ds = load_dataset("umdclip/leaderboard_co_set", split="train")

    # Agent that buzzes once any answer reaches 85% confidence.
    agent = QuizbowlAgent(model="gpt-4-turbo", buzz_threshold=0.85)

    # Example for tossup mode
    print("\n=== TOSSUP MODE EXAMPLE ===")
    sample_question = ds[0]
    question_runs = sample_question["question_runs"]
    print(question_runs[-1])
    print(sample_question["gold_label"])

    for result in agent.tossup_mode(question_runs):
        print(f"Guess at position {result['position']}: {result['answer']}")
        print(f"Confidence: {result['confidence']}")
        if result["buzz"]:
            print("Buzzed!\n")

    for result in agent.tossup_mode_top5(question_runs):
        formatted = ", ".join(
            f"{guess['answer']} ({guess['confidence']})" for guess in result["guesses"]
        )
        print(f"Guesses at position {result['position']}: {formatted}")
        if result["buzz"]:
            print("Buzzed!")

    # Example for bonus round mode: reuse the fully revealed tossup text.
    print("\n=== BONUS ROUND MODE EXAMPLE ===")
    bonus_result = agent.bonus_round_mode(question_runs[-1])
    print(f"Answer: {bonus_result['answer']}")
    print(f"Rationale: {bonus_result['rationale']}")
    print(f"Confidence: {bonus_result['confidence']}")

# %%