Spaces:

qanta-challenge
/

quizbowl-submission

Running

App Files Community

Maharshi Gor committed on May 26

Commit

ee5d50c

1 Parent(s): a4396db

Updated UI for bonus questions and playground dataset reference

Browse files

Files changed (8) hide show

app.py +3 -8
requirements.txt +1 -0
shared/workflows +1 -1
src/components/quizbowl/bonus.py +11 -10
src/components/quizbowl/plotting.py +34 -1
src/display/css_html_js.py +11 -0
src/display/custom_css.py +52 -3
src/envs.py +1 -4

app.py CHANGED Viewed

@@ -29,7 +29,7 @@ from envs import (
     LEADERBOARD_REFRESH_INTERVAL,
     LEADERBOARD_URL,
     LOG_LEVEL,
-    PLAYGROUND_DATASET_NAMES,
     QUEUE_REPO,
     REGISTRATION_URL,
     REPO_ID,
@@ -59,13 +59,8 @@ def filter_qids(qid: str, packet_ids: list[int]) -> bool:
     return packet_id in packet_ids
-def load_dataset(mode: str, max_questions: int = 10):
-    if mode == "tossup":
-        ds = datasets.load_dataset(PLAYGROUND_DATASET_NAMES["tossup"], split="eval")
-    elif mode == "bonus":
-        ds = datasets.load_dataset(PLAYGROUND_DATASET_NAMES["bonus"], split="eval")
-    else:
-        raise ValueError(f"Invalid mode: {mode}")
     return ds.filter(lambda x: filter_qids(x["qid"], [1])).select(range(max_questions))

     LEADERBOARD_REFRESH_INTERVAL,
     LEADERBOARD_URL,
     LOG_LEVEL,
+    PLAYGROUND_DATASET,
     QUEUE_REPO,
     REGISTRATION_URL,
     REPO_ID,
     return packet_id in packet_ids
+def load_dataset(config_name: str, max_questions: int = 10):
+    ds = datasets.load_dataset(PLAYGROUND_DATASET, config_name, split="eval")
     return ds.filter(lambda x: filter_qids(x["qid"], [1])).select(range(max_questions))

requirements.txt CHANGED Viewed

@@ -28,4 +28,5 @@ langchain-openai
 langchain-cohere
 langchain-deepseek
 json_repair
 loguru

 langchain-cohere
 langchain-deepseek
 json_repair
+unidecode
 loguru

shared/workflows CHANGED Viewed

	@@ -1 +1 @@
1	- Subproject commit ~~7f0d4f60746e4911abd1af80d8fbce1fff906549~~


1	+ Subproject commit 5a20959b241e3e73ed3112fb263ad51a8f63e381

src/components/quizbowl/bonus.py CHANGED Viewed

@@ -42,7 +42,7 @@ def process_bonus_results(results: list[dict]) -> pd.DataFrame:
 def initialize_eval_interface(example: dict, part_outputs: list[dict], input_vars: list[str]):
     """Initialize the interface with example text."""
     try:
-        html_content = create_bonus_html(example)
         # Create confidence plot data
         plot_data = create_bonus_confidence_plot(example["parts"], part_outputs)
@@ -221,7 +221,7 @@ class BonusInterface:
             return (
                 html_content,
                 gr.update(value=output_state),
-                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=True),
                 gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
                 gr.update(visible=False),
             )
@@ -248,17 +248,18 @@ class BonusInterface:
             model_outputs = run_and_eval_bonus_dataset(
                 agent, self.ds, num_workers=2, return_extras=True, tqdm_provider=progress.tqdm
             )
-            n_parts_correct = 0
             total_parts = 0
-            n_questions_correct = 0
-            for model_output in model_outputs:
                 part_outputs = model_output["part_outputs"]
-                n_parts_correct += sum(output["correct"] for output in part_outputs)
                 total_parts += len(part_outputs)
-                n_questions_correct += int(n_parts_correct == len(part_outputs))
-            p_accuracy = n_parts_correct / total_parts
-            q_accuracy = n_questions_correct / len(self.ds)
             df = pd.DataFrame(
                 [
                     {
@@ -271,7 +272,7 @@ class BonusInterface:
             # plot_data = create_scatter_pyplot(part_numbers, part_scores)
             return (
-                gr.update(value=df, label="Scores on Sample Set"),
                 gr.update(visible=False),
                 gr.update(visible=False),
             )

 def initialize_eval_interface(example: dict, part_outputs: list[dict], input_vars: list[str]):
     """Initialize the interface with example text."""
     try:
+        html_content = create_bonus_html(example, part_outputs)
         # Create confidence plot data
         plot_data = create_bonus_confidence_plot(example["parts"], part_outputs)
             return (
                 html_content,
                 gr.update(value=output_state),
+                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=False),
                 gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
                 gr.update(visible=False),
             )
             model_outputs = run_and_eval_bonus_dataset(
                 agent, self.ds, num_workers=2, return_extras=True, tqdm_provider=progress.tqdm
             )
+            total_parts_correct = 0
             total_parts = 0
+            total_questions_correct = 0
+            for i, model_output in enumerate(model_outputs):
                 part_outputs = model_output["part_outputs"]
+                n_parts_correct = sum(output["correct"] for output in part_outputs)
+                total_parts_correct += n_parts_correct
                 total_parts += len(part_outputs)
+                total_questions_correct += int(n_parts_correct == len(part_outputs))
+            p_accuracy = total_parts_correct / total_parts
+            q_accuracy = total_questions_correct / len(self.ds)
             df = pd.DataFrame(
                 [
                     {
             # plot_data = create_scatter_pyplot(part_numbers, part_scores)
             return (
+                gr.update(value=df, label="Scores on Sample Set", visible=True),
                 gr.update(visible=False),
                 gr.update(visible=False),
             )

src/components/quizbowl/plotting.py CHANGED Viewed

@@ -32,6 +32,34 @@ def _make_answer_line_html(answer_line: str) -> str:
     """
 def _get_token_classes(confidence, buzz, score) -> str:
     if confidence is None:
         return "token"
@@ -127,7 +155,7 @@ def create_tossup_html(
         return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
-def create_bonus_html(example: dict) -> str:
     # Create HTML for leadin and parts with answers
     leadin_html = f"<div class='bonus-leadin'>{example['leadin']}</div>"
     parts_html = []
@@ -139,11 +167,16 @@ def create_bonus_html(example: dict) -> str:
         else:
             answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
         "<div class='bonus-part-number'>Part {i + 1}</div>"
         part_html = f"""
                 <div class='bonus-part'>
                     <div class='bonus-part-text'><b>#{i + 1}.</b> {question_text}</div>
                     {answer_html}
                 </div>
             """
         parts_html.append(part_html)

     """
+def _make_model_response_html(part_output: dict, explanation_token_limit: int = 25) -> str:
+    guess = part_output.get("guess", "")
+    confidence = float(part_output.get("confidence", 0.0))
+    explanation = part_output.get("explanation", "")
+    expl_tokens = explanation.split()
+    if len(expl_tokens) > explanation_token_limit:
+        k = len(expl_tokens) - explanation_token_limit
+        explanation = " ".join(expl_tokens[:explanation_token_limit]) + f"...[{k} more words]"
+    correct = part_output.get("correct", 0)
+    emoji = "✅" if correct else "❌"
+    answer_class = "correct-answer" if correct else "incorrect-answer"
+    return f"""
+        <div class='bonus-answer {answer_class}'>
+            <div class="bonus-answer-row" style="margin-bottom: 4px;">
+                <span class='bonus-answer-label' style='font-size: 1.2em;'>🤖 Guess: </span>
+                <span class='bonus-model-guess'>{guess} {emoji}</span>
+                <span class='confidence-badge' style='float: right'>⚡️ Confidence: {confidence:.2f}</span>
+            </div>
+            <div class='bonus-explanation'>
+                <span class='bonus-answer-label'>💬 Explanation:</span>
+                <span class='bonus-explanation-text' style='font-style: italic;'>{explanation}</span>
+            </div>
+        </div>
+    """
 def _get_token_classes(confidence, buzz, score) -> str:
     if confidence is None:
         return "token"
         return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
+def create_bonus_html(example: dict, part_outputs: list[dict] | None = None) -> str:
     # Create HTML for leadin and parts with answers
     leadin_html = f"<div class='bonus-leadin'>{example['leadin']}</div>"
     parts_html = []
         else:
             answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
+        model_response_html = ""
+        if part_outputs is not None:
+            model_response_html = _make_model_response_html(part_outputs[i])
         "<div class='bonus-part-number'>Part {i + 1}</div>"
         part_html = f"""
                 <div class='bonus-part'>
                     <div class='bonus-part-text'><b>#{i + 1}.</b> {question_text}</div>
                     {answer_html}
+                    {model_response_html}
                 </div>
             """
         parts_html.append(part_html)

src/display/css_html_js.py CHANGED Viewed

@@ -132,6 +132,17 @@ fonts_header = """
 js_head = """
 <script>
     const gradioApp = document.getElementsByTagName('gradio-app')[0];
     console.log("Gradio app:", gradioApp);
     console.log(gradioApp.querySelectorAll('.token'));

 js_head = """
 <script>
+    function refresh() {
+        const url = new URL(window.location);
+        console.log("URL:", url);
+        const theme = url.searchParams.get('__theme');
+        console.log("Theme:", theme);
+        if (!theme || theme === 'dark') {
+            url.searchParams.set('__theme', 'light');
+            console.log("Setting theme to light");
+        }
+        window.location.href = url.href;
+    }
     const gradioApp = document.getElementsByTagName('gradio-app')[0];
     console.log("Gradio app:", gradioApp);
     console.log(gradioApp.querySelectorAll('.token'));

src/display/custom_css.py CHANGED Viewed

@@ -68,6 +68,21 @@ input[type=range][disabled] {
     opacity: .3;
 }
 .json-node {
     /* On a light background (usually white), use darker and vivid colors */
     font-size: var(--text-sm) !important;
@@ -582,20 +597,54 @@ css_bonus = """
 }
 .bonus-answer {
-    background-color: #fff5f5;
     border-radius: 6px;
     padding: 8px 12px;
     margin-top: 8px;
     font-size: 14px;
-    border-left: 3px solid #ff6b6b;
 }
 .bonus-answer-label {
     font-weight: 500;
-    color: #666;
     margin-bottom: 4px;
 }
 .bonus-answer-text {
     color: #333;
 }

     opacity: .3;
 }
+b, i, u, em, strong {
+    color: inherit !important;
+    /* Then override specific properties you want to keep */
+}
+b {font-weight: bold !important;}
+i {font-style: italic !important;}
+u {text-decoration: underline !important;}
+em {font-style: italic !important;}
+strong {font-weight: bold !important;}
 .json-node {
     /* On a light background (usually white), use darker and vivid colors */
     font-size: var(--text-sm) !important;
 }
 .bonus-answer {
+    background-color: #bde1ff;
     border-radius: 6px;
     padding: 8px 12px;
     margin-top: 8px;
     font-size: 14px;
+    border-left: 3px solid #133cba !important;
+}
+.confidence-badge {
+    font-size: 0.9em;
+    background-color: #2c3e50;
+    color: #fff;
+    padding: 2px 8px;
+    border-radius: 12px;
+    margin-left: 8px;
+}
+.correct-answer {
+    background-color: #b3f2ce !important;
+    border-left: 3px solid #228b22 !important;
+}
+.incorrect-answer {
+    background-color: #ffd1c9 !important;
+    border-left: 3px solid #ff4444 !important;
 }
 .bonus-answer-label {
     font-weight: 500;
+    color: #133cba;  /* Royal Blue */
     margin-bottom: 4px;
 }
+.bonus-model-guess {
+    border: 2px dotted #666 !important;
+    padding: 4px 8px !important;
+    border-radius: 4px !important;
+    font-weight: bold !important;
+}
+.correct-label {
+    color: #228b22;
+}
+.incorrect-label {
+    color: #ff4444;
+}
 .bonus-answer-text {
     color: #333;
 }

src/envs.py CHANGED Viewed

@@ -32,10 +32,7 @@ REGISTRATION_URL = "https://huggingface.co/spaces/qanta-challenge/register"
 LEADERBOARD_URL = "https://huggingface.co/spaces/qanta-challenge/leaderboard"
 EXAMPLES_PATH = "examples"
-PLAYGROUND_DATASET_NAMES = {
-    "tossup": f"{OWNER}/acf-co24-tossups",
-    "bonus": f"{OWNER}/acf-co24-bonuses",
-}
 # ----------------------------------

 LEADERBOARD_URL = "https://huggingface.co/spaces/qanta-challenge/leaderboard"
 EXAMPLES_PATH = "examples"
+PLAYGROUND_DATASET = f"{OWNER}/acf-co24"
 # ----------------------------------