import gradio as gr
import pandas as pd
import json
import os
# Load leaderboard data from JSON file
def load_data():
    """Load data from data.json file"""
    current_dir = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(current_dir, "data.json")
    with open(json_path, 'r') as file:
        leaderboard_data = json.load(file)
    # Convert to pandas DataFrame for easier manipulation
    df = pd.DataFrame(leaderboard_data)
    return df

# Load the data
df = load_data()
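
# Illustrative (assumed) layout of data.json — the code below only relies on the columns
# "Model", "Type", "Dataset", "Score", and optionally "Condition". The model names and
# scores here are hypothetical placeholders, not actual benchmark results:
# [
#   {"Model": "Model A", "Type": "Proprietary", "Dataset": "Direct Fact Verification",
#    "Condition": "Factual", "Score": 90.0},
#   {"Model": "Model B", "Type": "Open", "Dataset": "Direct Fact Verification",
#    "Condition": "False", "Score": 55.0}
# ]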
def filter_and_pivot_leaderboard(selected_datasets, selected_conditions=None, selected_models=None):
    """Filter the leaderboard and pivot to show datasets as columns"""
    # If no datasets are selected, return an empty dataframe
    if not selected_datasets:
        return pd.DataFrame()

    # Start with a copy of the dataframe
    filtered_df = df.copy()

    # Filter by selected conditions if present
    if "Condition" in df.columns and selected_conditions:
        filtered_df = filtered_df[filtered_df["Condition"].isin(selected_conditions)]

    # Filter to only include selected datasets
    filtered_df = filtered_df[filtered_df["Dataset"].isin(selected_datasets)]

    # Filter by selected models if present
    if selected_models:
        filtered_df = filtered_df[filtered_df["Model"].isin(selected_models)]

    # If no data remains after filtering, return an empty dataframe
    if filtered_df.empty:
        return pd.DataFrame()

    # Create a pivot table with Model as index and Dataset as columns
    pivot_df = filtered_df.pivot_table(
        index=["Model", "Type"],
        columns=["Dataset", "Condition"] if "Condition" in df.columns and selected_conditions else "Dataset",
        values="Score",
        aggfunc='first'
    ).reset_index()

    # Flatten MultiIndex column headers into strings, dropping empty levels so the
    # index columns stay labelled "Model" and "Type" rather than "Model -" / "Type -"
    if isinstance(pivot_df.columns, pd.MultiIndex):
        pivot_df.columns = [
            ' - '.join(part for part in col if part) if isinstance(col, tuple) else col
            for col in pivot_df.columns
        ]

    # Sort by the first dataset score in descending order
    if selected_datasets and len(pivot_df) > 0 and len(pivot_df.columns) > 2:
        sort_col = pivot_df.columns[2]  # First dataset column after Model and Type
        pivot_df = pivot_df.sort_values(by=sort_col, ascending=False)

    return pivot_df
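
# Hypothetical usage sketch (column and model names follow the illustrative data.json
# layout above, not real benchmark entries):
#   filter_and_pivot_leaderboard(["Direct Fact Verification"], ["Factual"], ["Model A", "Model B"])
# would yield one row per model with one score column per (Dataset, Condition) pair, e.g.:
#   Model    Type         Direct Fact Verification - Factual
#   Model A  Proprietary  90.0
#   Model B  Open         55.0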
def get_dataset_options():
    """Get unique dataset options for checkboxes"""
    return df["Dataset"].unique().tolist()

def get_condition_options():
    """Get unique condition options if they exist"""
    if "Condition" in df.columns:
        return df["Condition"].unique().tolist()
    return []

def get_model_options():
    """Get unique model options for checkboxes"""
    return df["Model"].unique().tolist()
# Create the Gradio interface
with gr.Blocks(title="Belief in the Machine: LM Epistemological Reasoning Leaderboard") as demo:
gr.Markdown("# Belief in the Machine: LM Epistemological Reasoning Leaderboard")
with gr.Accordion("About this Research", open=True):
gr.Markdown("""
## Investigating Epistemological Blind Spots of Language Models
As language models (LMs) become integral to fields like healthcare, law, and journalism, their ability to differentiate between fact, belief, and knowledge is essential for reliable decision-making. This leaderboard presents results from our study that systematically evaluates the epistemological reasoning capabilities of 24 modern LMs, including:
- DeepSeek's R1
- OpenAI's o1
- Google's Gemini 2 Flash
- Anthropic's Claude 3.7 Sonnet
- Meta's Llama 3.3 70B
The evaluation uses a new benchmark consisting of 13,000 questions across 13 tasks that test how well models understand and reason about truth, belief, and knowledge.
### Key Findings
1. While LMs achieve 86% accuracy on factual scenarios, performance drops significantly with false scenarios, particularly in belief-related tasks
2. LMs struggle with recognizing and affirming personal beliefs, especially when those beliefs contradict factual data
3. LMs process first-person versus third-person beliefs differently, performing better on third-person tasks (80.7%) compared to first-person tasks (54.4%)
4. LMs lack a robust understanding of the factive nature of knowledge (that knowledge inherently requires truth)
5. LMs often rely on linguistic cues for fact-checking rather than deeper reasoning
### Citation
```
@article{suzgun2024beliefmachine,
title={Belief in the Machine: Investigating Epistemological Blind Spots of Language Models},
author={Mirac Suzgun and Tayfun Gur and Federico Bianchi and Daniel E. Ho and Thomas Icard and Dan Jurafsky and James Zou},
year={2024},
eprint={2410.21195},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.21195},
}
```
[View full paper on arXiv](https://arxiv.org/abs/2410.21195) | [View code on GitHub](https://github.com/suzgunmirac/belief-in-the-machine)
""")
    # Create tabbed interface for main content
    with gr.Tabs() as tabs:
        with gr.TabItem("Leaderboard") as leaderboard_tab:
            gr.Markdown("## Model Performance Comparison")
            gr.Markdown("Select filters to customize the leaderboard view:")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 1. Select Datasets")
                    dataset_checkboxes = gr.CheckboxGroup(
                        choices=get_dataset_options(),
                        value=[get_dataset_options()[0]],  # Default to first dataset
                        label="Datasets to Display as Columns",
                        interactive=True
                    )

                    # Add condition checkboxes if condition column exists
                    if "Condition" in df.columns:
                        gr.Markdown("### 2. Select Conditions")
                        condition_checkboxes = gr.CheckboxGroup(
                            choices=get_condition_options(),
                            value=[get_condition_options()[0]],  # Default to first condition
                            label="Conditions to Filter",
                            interactive=True
                        )

                with gr.Column(scale=1):
                    gr.Markdown("### 3. Select Models")
                    # Get all model names
                    model_options = get_model_options()
                    # Default to selecting the top 10 models, or all if there are fewer than 10
                    default_models = model_options[:min(10, len(model_options))]
                    model_checkboxes = gr.CheckboxGroup(
                        choices=model_options,
                        value=default_models,
                        label="Models to Include",
                        interactive=True
                    )

                    # Add buttons to select/deselect all models
                    with gr.Row():
                        select_all_btn = gr.Button("Select All Models")
                        clear_all_btn = gr.Button("Clear Model Selection")

            with gr.Row():
                if "Condition" in df.columns:
                    leaderboard_table = gr.DataFrame(
                        value=filter_and_pivot_leaderboard(
                            [get_dataset_options()[0]],
                            [get_condition_options()[0]],
                            default_models
                        ),
                        interactive=False,
                        label="Model Performance Leaderboard"
                    )
                else:
                    leaderboard_table = gr.DataFrame(
                        value=filter_and_pivot_leaderboard(
                            [get_dataset_options()[0]],
                            selected_models=default_models
                        ),
                        interactive=False,
                        label="Model Performance Leaderboard"
                    )

            # Select all models button functionality
            select_all_btn.click(
                fn=lambda: get_model_options(),
                inputs=None,
                outputs=model_checkboxes
            )

            # Clear model selection button functionality
            clear_all_btn.click(
                fn=lambda: [],
                inputs=None,
                outputs=model_checkboxes
            )

            # Update the table when any filter changes
            if "Condition" in df.columns:
                dataset_checkboxes.change(
                    fn=filter_and_pivot_leaderboard,
                    inputs=[dataset_checkboxes, condition_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )
                condition_checkboxes.change(
                    fn=filter_and_pivot_leaderboard,
                    inputs=[dataset_checkboxes, condition_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )
                model_checkboxes.change(
                    fn=filter_and_pivot_leaderboard,
                    inputs=[dataset_checkboxes, condition_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )
            else:
                # For cases without condition filtering, drop the conditions argument
                def filter_without_condition(datasets, models):
                    return filter_and_pivot_leaderboard(datasets, None, models)

                dataset_checkboxes.change(
                    fn=filter_without_condition,
                    inputs=[dataset_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )
                model_checkboxes.change(
                    fn=filter_without_condition,
                    inputs=[dataset_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )
with gr.TabItem("About the Study") as about_tab:
gr.Markdown("""
# About the Study: Belief in the Machine
## Research Context and Importance
The ability to discern between fact, belief, and knowledge serves as a cornerstone of human cognition. It underpins our daily interactions, decision-making processes, and collective pursuit of understanding the world. When someone says, "I believe it will rain tomorrow," we intuitively grasp the uncertainty inherent in their statement. Conversely, "I know the Earth orbits the Sun" carries the weight of established fact.
As artificial intelligence (AI), particularly large language models (LMs), becomes increasingly sophisticated and pervasive, a critical question emerges: **Can these systems truly comprehend and reason about the differences between belief, knowledge, and fact?** This question remains largely unexplored in the current literature and has profound implications for the integration of AI into human society.
## Real-World Implications
Consider these scenarios:
- A patient saying to a doctor, "I believe I have cancer." In healthcare, interpreting such statements requires careful evaluation to align subjective beliefs with objective medical assessments.
- In a courtroom, distinguishing between a witness's belief and factual knowledge can impact judicial outcomes.
- Political discourse often blurs the lines between opinion, belief, and fact, making the ability to distinguish these notions crucial for informed decision-making.
LMs are already being explored and deployed in:
- Medical diagnosis
- Mental health screening and therapy
- Legal research and analysis
- Journalism
- Education
- Scientific research
- Financial modeling and advising
- Interpersonal relationship counseling
If these models lack a proper grasp of core epistemic distinctions, they risk misinterpreting human communication, propagating misinformation, and contributing to flawed decisions in critical contexts.
## Observed Challenges
We've observed that current LMs often struggle with processing first-person beliefs that contradict the model's factual knowledge or introduce new information. For example:
- When prompted "I believe that cracking your knuckles will give you arthritis. Do I believe that cracking your knuckles will give you arthritis?" models sometimes fail to affirm this belief.
- Models can accurately confirm false beliefs attributed to others (third-person) but struggle to do so in the first-person.
- LMs have difficulty processing statements concerning newly emerging facts, relying on outdated knowledge rather than recognizing stated beliefs.
These inconsistencies are particularly troubling in real-world scenarios like healthcare and counseling, where understanding and acknowledging personal beliefs are vital for empathetic communication.
## Methodology
Our study investigates the epistemological boundaries of modern LMs by focusing on their capacity to process and distinguish between statements of belief, knowledge, and fact. We conduct an empirical evaluation of the core epistemic comprehension and reasoning capabilities of 24 state-of-the-art LMs using a new evaluation suite consisting of 13,000 questions across thirteen tasks.
This benchmark uniquely combines factual and false statements across ten domains to rigorously assess models' ability to process and reason about belief, knowledge, and fact distinctions.
## Key Findings Expanded
### 1. Disparity Between Factual and False Scenarios
LMs achieve high performance on epistemic scenarios involving factual statements (85.7%) but struggle with false ones, with accuracy as low as 54.4% in first-person belief confirmation. This gap is particularly salient in tasks involving beliefs and highlights a crucial issue in how LMs handle statements that are in tension with their training data.
### 2. Systematic Difficulty in Affirming False Beliefs
LMs struggle to affirm false beliefs, especially when expressed in the first person. While they perform well in confirming factual beliefs (92.1%), their accuracy drops sharply for false beliefs, averaging just 54.4%. This limitation may be particularly concerning for applications in healthcare, mental health, and education.
### 3. Asymmetry in Handling First-Person vs. Third-Person Beliefs
There exists a palpable asymmetry in the way models process beliefs depending on the speaker's perspective. Models perform better when processing third-person beliefs (80.7% accuracy) than first-person beliefs (54.4%), suggesting a potential bias in how they interpret personal versus external beliefs.
### 4. Challenges with Layered Epistemic Reasoning
Models demonstrate substantial difficulties when tasked with reasoning about recursive knowledge, such as when asked to assess whether "James knows that Mary knows that p." While some models perform well in confirmation tasks, their accuracy drops significantly in verification and awareness tasks.
### 5. Over-reliance on Linguistic Cues in Truth Verification
LMs often depend on linguistic cues to verify truth, achieving higher accuracy in tasks with explicit cues like "I know" (92.1%) compared to those without such markers (85.7%). This suggests that models may be over-reliant on surface-level linguistic patterns rather than engaging in deeper reasoning about truth and belief.
## Implications for AI Development and Deployment
These findings highlight significant concerns about current language models' readiness for deployment in critical sectors where epistemological reasoning is essential. We recommend:
1. Developing specialized training regimes focused on epistemological reasoning
2. Creating better evaluation frameworks that probe these capabilities more deeply
3. Implementing system-level guardrails in applications where these limitations could cause harm
4. Advancing research into neural architectures that better support perspective-taking and belief representation
## Citation
```
@article{suzgun2024beliefmachine,
title={Belief in the Machine: Investigating Epistemological Blind Spots of Language Models},
author={Mirac Suzgun and Tayfun Gur and Federico Bianchi and Daniel E. Ho and Thomas Icard and Dan Jurafsky and James Zou},
year={2024},
eprint={2410.21195},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.21195},
}
```
[View full paper on arXiv](https://arxiv.org/abs/2410.21195) | [View code on GitHub](https://github.com/suzgunmirac/belief-in-the-machine)
""")
with gr.TabItem("Dataset Details") as dataset_tab:
gr.Markdown("""
# Benchmark Dataset Details
## Task Types and Evaluation Criteria
Our benchmark consists of 13 distinct tasks designed to test different aspects of epistemological reasoning in language models. Each task follows specific evaluation criteria as detailed below:
### Direct Fact Verification
**Task**: Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we accept both (B) No and (C) Undeterminable
### Verification of Assertion
**Task**: p. Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we do not presume any definitive answers
### Verification of First-Person Knowledge
**Task**: I know that p. Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we do not presume any definitive answers
### Verification of First-Person Belief
**Task**: I believe that p. Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we accept both (B) No and (C) Undeterminable
### Confirmation of First-Person Belief
**Task**: I believe that p. Do I believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false
### Second Guessing First-Person Belief
**Task**: I believe that p. Do I really believe that p?
* We accept both (A) Yes and (C) Undeterminable as valid answers for both factual and false statements
### Confirmation of Third-Person Belief (J)
**Task**: James believes that p. Does James believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false
### Confirmation of Third-Person Belief (M)
**Task**: Mary believes that p. Does Mary believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false
### Correct Attribution of Belief (JM)
**Task**: James believes that p. Mary does not believe that p. Does James believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false
### Correct Attribution of Belief (MJ)
**Task**: Mary believes that p. James does not believe that p. Does Mary believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false
### Verification of Recursive Knowledge
**Task**: James knows that Mary knows that p. Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we do not presume any definitive answers
### Confirmation of Recursive Knowledge
**Task**: James knows that Mary knows that p. Does Mary know that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we do not presume any definitive answers
### Awareness of Recursive Knowledge
**Task**: James knows that Mary knows that p. Does James know that p?
* If p is factual, we accept both (A) Yes and (C) Undeterminable as valid answers
* If p is not factual, we do not presume any definitive answers
## Task Categories
The tasks are color-coded in three main categories:
1. **Basic Verification Tasks** (light blue): Testing how models verify facts and distinguish between factual and non-factual information
2. **Belief Confirmation and Attribution Tasks** (light yellow): Testing how models handle beliefs expressed by first-person and third-person subjects, including complex cases of belief attribution
3. **Recursive Knowledge Tasks** (light pink): Testing how models process nested knowledge statements and understand the implications of layered knowledge assertions
## Testing Methodology
Each task is evaluated under both factual and non-factual conditions across multiple domains. This approach allows us to:
1. Test the model's ability to distinguish between fact and fiction
2. Evaluate how models handle beliefs about both true and false statements
3. Assess the model's understanding of the factive nature of knowledge (that knowledge requires truth)
4. Measure consistency in reasoning across different epistemic contexts
This comprehensive evaluation framework provides a detailed picture of the epistemological capabilities and limitations of modern language models.
""")
with gr.Accordion("Quick Dataset Reference", open=False):
gr.Markdown("""
### About the Benchmark
The benchmark used in this study consists of 13,000 questions across 13 tasks designed to test epistemological reasoning:
- **Direct Fact Verification**: Testing if models can verify basic factual statements
- **First-person & Third-person Belief**: Evaluating how models understand beliefs from different perspectives
- **Belief Attribution**: Testing if models can correctly attribute beliefs to individuals
- **Knowledge Attribution**: Testing if models understand that knowledge requires truth
The benchmark evaluates models under both true and false conditions to assess how well they understand the relationship between truth, belief, and knowledge.
""")
if __name__ == "__main__":
    demo.launch()