vinid committed on
Commit
47673be
·
0 Parent(s):
Files changed (4) hide show
  1. .gitignore +3 -0
  2. app.py +433 -0
  3. data.json +3530 -0
  4. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ notebook.ipynb
2
+ .ipynb_checkpoints/
3
+ .venv/
app.py ADDED
@@ -0,0 +1,433 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import json
4
+ import os
5
+
6
# Load leaderboard data from JSON file
def load_data():
    """Read data.json (stored next to this script) into a pandas DataFrame."""
    # Resolve the JSON path relative to this file so the app works no
    # matter what the current working directory is.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(base_dir, "data.json")

    with open(data_path, 'r') as fh:
        records = json.load(fh)

    # A DataFrame makes the filtering/pivoting below straightforward.
    return pd.DataFrame(records)

# Load the data once at import time; the UI code reads this module-level frame.
df = load_data()
21
+
22
def filter_and_pivot_leaderboard(selected_datasets, selected_conditions=None, selected_models=None):
    """Filter the leaderboard and pivot to show datasets as columns.

    Args:
        selected_datasets: dataset names to keep; each becomes a score column.
        selected_conditions: optional condition names to keep (only honoured
            when the data actually has a "Condition" column).
        selected_models: optional model names to keep as rows.

    Returns:
        A DataFrame with Model/Type columns followed by one score column per
        dataset (or per (dataset, condition) pair), sorted descending by the
        first score column; an empty DataFrame when nothing matches.
    """
    # No dataset columns requested -> nothing to display.
    if not selected_datasets:
        return pd.DataFrame()

    # Condition filtering only applies when the column exists AND the caller
    # actually picked some conditions.
    use_conditions = "Condition" in df.columns and bool(selected_conditions)

    # Apply the three filters in sequence on a working copy.
    view = df.copy()
    if use_conditions:
        view = view[view["Condition"].isin(selected_conditions)]
    view = view[view["Dataset"].isin(selected_datasets)]
    if selected_models:
        view = view[view["Model"].isin(selected_models)]

    if view.empty:
        return pd.DataFrame()

    # Wide layout: rows keyed by (Model, Type), one column per dataset
    # (or per dataset/condition pair when conditions are in play).
    column_spec = ["Dataset", "Condition"] if use_conditions else "Dataset"
    pivot_df = view.pivot_table(
        index=["Model", "Type"],
        columns=column_spec,
        values="Score",
        aggfunc='first',
    ).reset_index()

    # Rank models by the first score column; columns 0 and 1 are Model/Type.
    if selected_datasets and len(pivot_df) > 0 and len(pivot_df.columns) > 2:
        first_score_col = pivot_df.columns[2]
        pivot_df = pivot_df.sort_values(by=first_score_col, ascending=False)

    return pivot_df
60
+
61
def get_dataset_options():
    """Return every distinct dataset name, in first-appearance order."""
    return df["Dataset"].drop_duplicates().tolist()
64
+
65
def get_condition_options():
    """Return distinct condition names, or [] when the data has no Condition column."""
    if "Condition" not in df.columns:
        return []
    return df["Condition"].drop_duplicates().tolist()
70
+
71
def get_model_options():
    """Return every distinct model name for the model checkboxes."""
    return df["Model"].drop_duplicates().tolist()
74
+
75
# Create the Gradio interface
# NOTE(review): indentation reconstructed — the diff rendering stripped leading
# whitespace; block nesting inferred from the with-statements.
with gr.Blocks(title="Belief in the Machine: LM Epistemological Reasoning Leaderboard") as demo:
    # Page title shown above all tabs.
    gr.Markdown("# Belief in the Machine: LM Epistemological Reasoning Leaderboard")

    # Collapsible summary of the paper, open by default.
    with gr.Accordion("About this Research", open=True):
        gr.Markdown("""
## Investigating Epistemological Blind Spots of Language Models

As language models (LMs) become integral to fields like healthcare, law, and journalism, their ability to differentiate between fact, belief, and knowledge is essential for reliable decision-making. This leaderboard presents results from our study that systematically evaluates the epistemological reasoning capabilities of 24 modern LMs, including:

- DeepSeek's R1
- OpenAI's o1
- Google's Gemini 2 Flash
- Anthropic's Claude 3.7 Sonnet
- Meta's Llama 3.3 70B

The evaluation uses a new benchmark consisting of 13,000 questions across 13 tasks that test how well models understand and reason about truth, belief, and knowledge.

### Key Findings

1. While LMs achieve 86% accuracy on factual scenarios, performance drops significantly with false scenarios, particularly in belief-related tasks
2. LMs struggle with recognizing and affirming personal beliefs, especially when those beliefs contradict factual data
3. LMs process first-person versus third-person beliefs differently, performing better on third-person tasks (80.7%) compared to first-person tasks (54.4%)
4. LMs lack a robust understanding of the factive nature of knowledge (that knowledge inherently requires truth)
5. LMs often rely on linguistic cues for fact-checking rather than deeper reasoning

### Citation

```
@article{suzgun2024beliefmachine,
title={Belief in the Machine: Investigating Epistemological Blind Spots of Language Models},
author={Mirac Suzgun and Tayfun Gur and Federico Bianchi and Daniel E. Ho and Thomas Icard and Dan Jurafsky and James Zou},
year={2024},
eprint={2410.21195},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.21195},
}
```

[View full paper on arXiv](https://arxiv.org/abs/2410.21195) | [View code on GitHub](https://github.com/suzgunmirac/belief-in-the-machine)
""")
117
+
118
    # Create tabbed interface for main content
    # NOTE(review): indentation reconstructed — the diff rendering stripped
    # leading whitespace; the button row is assumed to sit at tab level.
    with gr.Tabs() as tabs:
        with gr.TabItem("Leaderboard") as leaderboard_tab:
            gr.Markdown("## Model Performance Comparison")
            gr.Markdown("Select filters to customize the leaderboard view:")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 1. Select Datasets")
                    # Each checked dataset becomes a column in the pivoted table.
                    dataset_checkboxes = gr.CheckboxGroup(
                        choices=get_dataset_options(),
                        value=[get_dataset_options()[0]],  # Default to first dataset
                        label="Datasets to Display as Columns",
                        interactive=True
                    )

                    # Add condition checkboxes if condition column exists
                    # (this branch is evaluated once, at UI-build time).
                    if "Condition" in df.columns:
                        gr.Markdown("### 2. Select Conditions")
                        condition_checkboxes = gr.CheckboxGroup(
                            choices=get_condition_options(),
                            value=[get_condition_options()[0]],  # Default to first condition
                            label="Conditions to Filter",
                            interactive=True
                        )

                with gr.Column(scale=1):
                    gr.Markdown("### 3. Select Models")
                    # Get all model names
                    model_options = get_model_options()
                    # Default to selecting top 10 models or all if less than 10
                    default_models = model_options[:min(10, len(model_options))]

                    model_checkboxes = gr.CheckboxGroup(
                        choices=model_options,
                        value=default_models,
                        label="Models to Include",
                        interactive=True
                    )

            # Add a button to select/deselect all models
            with gr.Row():
                select_all_btn = gr.Button("Select All Models")
                clear_all_btn = gr.Button("Clear Model Selection")

            # Initial table contents mirror the default checkbox values above.
            with gr.Row():
                if "Condition" in df.columns:
                    leaderboard_table = gr.DataFrame(
                        value=filter_and_pivot_leaderboard(
                            [get_dataset_options()[0]],
                            [get_condition_options()[0]],
                            default_models
                        ),
                        interactive=False,
                        label="Model Performance Leaderboard"
                    )
                else:
                    leaderboard_table = gr.DataFrame(
                        value=filter_and_pivot_leaderboard(
                            [get_dataset_options()[0]],
                            selected_models=default_models
                        ),
                        interactive=False,
                        label="Model Performance Leaderboard"
                    )

            # Select all models button functionality
            select_all_btn.click(
                fn=lambda: get_model_options(),
                inputs=None,
                outputs=model_checkboxes
            )

            # Clear model selection button functionality
            clear_all_btn.click(
                fn=lambda: [],
                inputs=None,
                outputs=model_checkboxes
            )

            # Update the table when any filter changes
            if "Condition" in df.columns:
                dataset_checkboxes.change(
                    fn=filter_and_pivot_leaderboard,
                    inputs=[dataset_checkboxes, condition_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )

                condition_checkboxes.change(
                    fn=filter_and_pivot_leaderboard,
                    inputs=[dataset_checkboxes, condition_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )

                model_checkboxes.change(
                    fn=filter_and_pivot_leaderboard,
                    inputs=[dataset_checkboxes, condition_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )
            else:
                # For cases without condition filtering, we need to handle the function differently
                def filter_without_condition(datasets, models):
                    # Adapter: drop the (absent) conditions argument.
                    return filter_and_pivot_leaderboard(datasets, None, models)

                dataset_checkboxes.change(
                    fn=filter_without_condition,
                    inputs=[dataset_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )

                model_checkboxes.change(
                    fn=filter_without_condition,
                    inputs=[dataset_checkboxes, model_checkboxes],
                    outputs=leaderboard_table
                )
233
+
234
        # Static prose tab: long-form description of the study (no interactivity).
        with gr.TabItem("About the Study") as about_tab:
            gr.Markdown("""
# About the Study: Belief in the Machine

## Research Context and Importance

The ability to discern between fact, belief, and knowledge serves as a cornerstone of human cognition. It underpins our daily interactions, decision-making processes, and collective pursuit of understanding the world. When someone says, "I believe it will rain tomorrow," we intuitively grasp the uncertainty inherent in their statement. Conversely, "I know the Earth orbits the Sun" carries the weight of established fact.

As artificial intelligence (AI), particularly large language models (LMs), becomes increasingly sophisticated and pervasive, a critical question emerges: **Can these systems truly comprehend and reason about the differences between belief, knowledge, and fact?** This question remains largely unexplored in the current literature and has profound implications for the integration of AI into human society.

## Real-World Implications

Consider these scenarios:

- A patient saying to a doctor, "I believe I have cancer." In healthcare, interpreting such statements requires careful evaluation to align subjective beliefs with objective medical assessments.
- In a courtroom, distinguishing between a witness's belief and factual knowledge can impact judicial outcomes.
- Political discourse often blurs the lines between opinion, belief, and fact, making the ability to distinguish these notions crucial for informed decision-making.

LMs are already being explored and deployed in:

- Medical diagnosis
- Mental health screening and therapy
- Legal research and analysis
- Journalism
- Education
- Scientific research
- Financial modeling and advising
- Interpersonal relationship counseling

If these models lack a proper grasp of core epistemic distinctions, they risk misinterpreting human communication, propagating misinformation, and contributing to flawed decisions in critical contexts.

## Observed Challenges

We've observed that current LMs often struggle with processing first-person beliefs that contradict the model's factual knowledge or introduce new information. For example:

- When prompted "I believe that cracking your knuckles will give you arthritis. Do I believe that cracking your knuckles will give you arthritis?" models sometimes fail to affirm this belief.
- Models can accurately confirm false beliefs attributed to others (third-person) but struggle to do so in the first-person.
- LMs have difficulty processing statements concerning newly emerging facts, relying on outdated knowledge rather than recognizing stated beliefs.

These inconsistencies are particularly troubling in real-world scenarios like healthcare and counseling, where understanding and acknowledging personal beliefs are vital for empathetic communication.

## Methodology

Our study investigates the epistemological boundaries of modern LMs by focusing on their capacity to process and distinguish between statements of belief, knowledge, and fact. We conduct an empirical evaluation of the core epistemic comprehension and reasoning capabilities of 24 state-of-the-art LMs using a new evaluation suite consisting of 13,000 questions across thirteen tasks.

This benchmark uniquely combines factual and false statements across ten domains to rigorously assess models' ability to process and reason about belief, knowledge, and fact distinctions.

## Key Findings Expanded

### 1. Disparity Between Factual and False Scenarios

LMs achieve high performance on epistemic scenarios involving factual statements (85.7%) but struggle with false ones (having accuracy as low as 54.4% in first-person belief confirmation). This gap is particularly salient in tasks involving beliefs and highlights a crucial issue in how LMs handle statements that are in tension with their training data.

### 2. Systematic Difficulty in Affirming False Beliefs

LMs struggle to affirm false beliefs, especially when expressed in the first person. While they perform well in confirming factual beliefs (92.1%), their accuracy drops sharply for false beliefs, averaging just 54.4%. This limitation may be particularly concerning for applications in healthcare, mental health, and education.

### 3. Asymmetry in Handling First-Person vs. Third-Person Beliefs

There exists a palpable asymmetry in the way models process beliefs depending on the speaker's perspective. Models perform better when processing third-person beliefs (80.7% accuracy) than first-person beliefs (54.4%), suggesting a potential bias in how they interpret personal versus external beliefs.

### 4. Challenges with Layered Epistemic Reasoning

Models demonstrate substantial difficulties when tasked with reasoning about recursive knowledge, such as when asked to assess whether "James knows that Mary knows that p." While some models perform well in confirmation tasks, their accuracy drops significantly in verification and awareness tasks.

### 5. Over-reliance on Linguistic Cues in Truth Verification

LMs often depend on linguistic cues to verify truth, achieving higher accuracy in tasks with explicit cues like "I know" (92.1%) compared to those without such markers (85.7%). This suggests that models may be over-reliant on surface-level linguistic patterns rather than engaging in deeper reasoning about truth and belief.

## Implications for AI Development and Deployment

These findings highlight significant concerns about current language models' readiness for deployment in critical sectors where epistemological reasoning is essential. We recommend:

1. Developing specialized training regimes focused on epistemological reasoning
2. Creating better evaluation frameworks that probe these capabilities more deeply
3. Implementing system-level guardrails in applications where these limitations could cause harm
4. Advancing research into neural architectures that better support perspective-taking and belief representation

## Citation

```
@article{suzgun2024beliefmachine,
title={Belief in the Machine: Investigating Epistemological Blind Spots of Language Models},
author={Mirac Suzgun and Tayfun Gur and Federico Bianchi and Daniel E. Ho and Thomas Icard and Dan Jurafsky and James Zou},
year={2024},
eprint={2410.21195},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2410.21195},
}
```

[View full paper on arXiv](https://arxiv.org/abs/2410.21195) | [View code on GitHub](https://github.com/suzgunmirac/belief-in-the-machine)
""")
328
+
329
        # Static prose tab: per-task benchmark documentation plus a collapsed
        # quick-reference accordion (no interactivity).
        with gr.TabItem("Dataset Details") as dataset_tab:
            gr.Markdown("""
# Benchmark Dataset Details

## Task Types and Evaluation Criteria

Our benchmark consists of 13 distinct tasks designed to test different aspects of epistemological reasoning in language models. Each task follows specific evaluation criteria as detailed below:

### Direct Fact Verification
**Task**: Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we accept both (B) No and (C) Undeterminable

### Verification of Assertion
**Task**: p. Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we do not presume any definitive answers

### Verification of First-Person Knowledge
**Task**: I know that p. Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we do not presume any definitive answers

### Verification of First-Person Belief
**Task**: I believe that p. Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we accept both (B) No and (C) Undeterminable

### Confirmation of First-Person Belief
**Task**: I believe that p. Do I believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false

### Second Guessing First-Person Belief
**Task**: I believe that p. Do I really believe that p?
* We accept both (A) Yes and (C) Undeterminable as valid answers for both factual and false statements

### Confirmation of Third-Person Belief (J)
**Task**: James believes that p. Does James believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false

### Confirmation of Third-Person Belief (M)
**Task**: Mary believes that p. Does Mary believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false

### Correct Attribution of Belief (JM)
**Task**: James believes that p. Mary does not believe that p. Does James believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false

### Correct Attribution of Belief (MJ)
**Task**: Mary believes that p. James does not believe that p. Does Mary believe that p?
* The answer is always (A) Yes, regardless of whether p is factual or false

### Verification of Recursive Knowledge
**Task**: James knows that Mary knows that p. Is it true that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we do not presume any definitive answers

### Confirmation of Recursive Knowledge
**Task**: James knows that Mary knows that p. Does Mary know that p?
* If p is factual, we accept (A) Yes as a valid answer
* If p is not factual, we do not presume any definitive answers

### Awareness of Recursive Knowledge
**Task**: James knows that Mary knows that p. Does James know that p?
* If p is factual, we accept (A) Yes and (C) Undeterminable
* If p is not factual, we do not presume any definitive answers

## Task Categories

The tasks are color-coded in three main categories:

1. **Basic Verification Tasks** (light blue): Testing how models verify facts and distinguish between factual and non-factual information

2. **Belief Confirmation and Attribution Tasks** (light yellow): Testing how models handle beliefs expressed by first-person and third-person subjects, including complex cases of belief attribution

3. **Recursive Knowledge Tasks** (light pink): Testing how models process nested knowledge statements and understand the implications of layered knowledge assertions

## Testing Methodology

Each task is evaluated under both factual and non-factual conditions across multiple domains. This approach allows us to:

1. Test the model's ability to distinguish between fact and fiction
2. Evaluate how models handle beliefs about both true and false statements
3. Assess the model's understanding of the factive nature of knowledge (that knowledge requires truth)
4. Measure consistency in reasoning across different epistemic contexts

This comprehensive evaluation framework provides a detailed picture of the epistemological capabilities and limitations of modern language models.
""")

            # Collapsed-by-default short summary of the benchmark.
            with gr.Accordion("Quick Dataset Reference", open=False):
                gr.Markdown("""
### About the Benchmark

The benchmark used in this study consists of 13,000 questions across 13 tasks designed to test epistemological reasoning:

- **Direct Fact Verification**: Testing if models can verify basic factual statements
- **First-person & Third-person Belief**: Evaluating how models understand beliefs from different perspectives
- **Belief Attribution**: Testing if models can correctly attribute beliefs to individuals
- **Knowledge Attribution**: Testing if models understand that knowledge requires truth

The benchmark evaluates models under both true and false conditions to assess how well they understand the relationship between truth, belief, and knowledge.
""")
431
+
432
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
data.json ADDED
@@ -0,0 +1,3530 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "Model": "o1",
4
+ "Type": "Large Language Model",
5
+ "Dataset": "Direct Fact Ver.",
6
+ "Score": 94.4,
7
+ "Condition": "True"
8
+ },
9
+ {
10
+ "Model": "o3-mini",
11
+ "Type": "Large Language Model",
12
+ "Dataset": "Direct Fact Ver.",
13
+ "Score": 89.2,
14
+ "Condition": "True"
15
+ },
16
+ {
17
+ "Model": "4o",
18
+ "Type": "Large Language Model",
19
+ "Dataset": "Direct Fact Ver.",
20
+ "Score": 95.8,
21
+ "Condition": "True"
22
+ },
23
+ {
24
+ "Model": "3.7 Sonnet",
25
+ "Type": "Large Language Model",
26
+ "Dataset": "Direct Fact Ver.",
27
+ "Score": 90.6,
28
+ "Condition": "True"
29
+ },
30
+ {
31
+ "Model": "3.5 Sonnet",
32
+ "Type": "Large Language Model",
33
+ "Dataset": "Direct Fact Ver.",
34
+ "Score": 86.2,
35
+ "Condition": "True"
36
+ },
37
+ {
38
+ "Model": "Gemini 2 Flash",
39
+ "Type": "Large Language Model",
40
+ "Dataset": "Direct Fact Ver.",
41
+ "Score": 87.0,
42
+ "Condition": "True"
43
+ },
44
+ {
45
+ "Model": "Gemini 2 Flash-Lite",
46
+ "Type": "Large Language Model",
47
+ "Dataset": "Direct Fact Ver.",
48
+ "Score": 93.4,
49
+ "Condition": "True"
50
+ },
51
+ {
52
+ "Model": "R1",
53
+ "Type": "Large Language Model",
54
+ "Dataset": "Direct Fact Ver.",
55
+ "Score": 88.0,
56
+ "Condition": "True"
57
+ },
58
+ {
59
+ "Model": "R1-Dist.-Llama-70B",
60
+ "Type": "Large Language Model",
61
+ "Dataset": "Direct Fact Ver.",
62
+ "Score": 94.6,
63
+ "Condition": "True"
64
+ },
65
+ {
66
+ "Model": "R1-Dist.Qwen-14B",
67
+ "Type": "Large Language Model",
68
+ "Dataset": "Direct Fact Ver.",
69
+ "Score": 85.0,
70
+ "Condition": "True"
71
+ },
72
+ {
73
+ "Model": "Llama 3.3-70B Inst. Turbo",
74
+ "Type": "Large Language Model",
75
+ "Dataset": "Direct Fact Ver.",
76
+ "Score": 97.8,
77
+ "Condition": "True"
78
+ },
79
+ {
80
+ "Model": "GPT-4",
81
+ "Type": "Large Language Model",
82
+ "Dataset": "Direct Fact Ver.",
83
+ "Score": 90.6,
84
+ "Condition": "True"
85
+ },
86
+ {
87
+ "Model": "GPT-3.5",
88
+ "Type": "Large Language Model",
89
+ "Dataset": "Direct Fact Ver.",
90
+ "Score": 89.8,
91
+ "Condition": "True"
92
+ },
93
+ {
94
+ "Model": "3 Opus",
95
+ "Type": "Large Language Model",
96
+ "Dataset": "Direct Fact Ver.",
97
+ "Score": 85.0,
98
+ "Condition": "True"
99
+ },
100
+ {
101
+ "Model": "3 Sonnet",
102
+ "Type": "Large Language Model",
103
+ "Dataset": "Direct Fact Ver.",
104
+ "Score": 78.2,
105
+ "Condition": "True"
106
+ },
107
+ {
108
+ "Model": "3 Haiku",
109
+ "Type": "Large Language Model",
110
+ "Dataset": "Direct Fact Ver.",
111
+ "Score": 88.4,
112
+ "Condition": "True"
113
+ },
114
+ {
115
+ "Model": "Mixtral 8x22B",
116
+ "Type": "Large Language Model",
117
+ "Dataset": "Direct Fact Ver.",
118
+ "Score": 82.4,
119
+ "Condition": "True"
120
+ },
121
+ {
122
+ "Model": "Mixtral 8x7B",
123
+ "Type": "Large Language Model",
124
+ "Dataset": "Direct Fact Ver.",
125
+ "Score": 83.6,
126
+ "Condition": "True"
127
+ },
128
+ {
129
+ "Model": "Mixtral 7B",
130
+ "Type": "Large Language Model",
131
+ "Dataset": "Direct Fact Ver.",
132
+ "Score": 65.2,
133
+ "Condition": "True"
134
+ },
135
+ {
136
+ "Model": "Llama-3 70B",
137
+ "Type": "Large Language Model",
138
+ "Dataset": "Direct Fact Ver.",
139
+ "Score": 91.4,
140
+ "Condition": "True"
141
+ },
142
+ {
143
+ "Model": "Llama-3 8B",
144
+ "Type": "Large Language Model",
145
+ "Dataset": "Direct Fact Ver.",
146
+ "Score": 86.0,
147
+ "Condition": "True"
148
+ },
149
+ {
150
+ "Model": "Llama-2 70B",
151
+ "Type": "Large Language Model",
152
+ "Dataset": "Direct Fact Ver.",
153
+ "Score": 90.8,
154
+ "Condition": "True"
155
+ },
156
+ {
157
+ "Model": "Llama-2 13B",
158
+ "Type": "Large Language Model",
159
+ "Dataset": "Direct Fact Ver.",
160
+ "Score": 85.8,
161
+ "Condition": "True"
162
+ },
163
+ {
164
+ "Model": "Llama-2 7B",
165
+ "Type": "Large Language Model",
166
+ "Dataset": "Direct Fact Ver.",
167
+ "Score": 85.8,
168
+ "Condition": "True"
169
+ },
170
+ {
171
+ "Model": "o1",
172
+ "Type": "Large Language Model",
173
+ "Dataset": "Direct Fact Ver.",
174
+ "Score": 98.2,
175
+ "Condition": "False"
176
+ },
177
+ {
178
+ "Model": "o3-mini",
179
+ "Type": "Large Language Model",
180
+ "Dataset": "Direct Fact Ver.",
181
+ "Score": 96.2,
182
+ "Condition": "False"
183
+ },
184
+ {
185
+ "Model": "4o",
186
+ "Type": "Large Language Model",
187
+ "Dataset": "Direct Fact Ver.",
188
+ "Score": 91.4,
189
+ "Condition": "False"
190
+ },
191
+ {
192
+ "Model": "3.7 Sonnet",
193
+ "Type": "Large Language Model",
194
+ "Dataset": "Direct Fact Ver.",
195
+ "Score": 97.6,
196
+ "Condition": "False"
197
+ },
198
+ {
199
+ "Model": "3.5 Sonnet",
200
+ "Type": "Large Language Model",
201
+ "Dataset": "Direct Fact Ver.",
202
+ "Score": 96.8,
203
+ "Condition": "False"
204
+ },
205
+ {
206
+ "Model": "Gemini 2 Flash",
207
+ "Type": "Large Language Model",
208
+ "Dataset": "Direct Fact Ver.",
209
+ "Score": 91.0,
210
+ "Condition": "False"
211
+ },
212
+ {
213
+ "Model": "Gemini 2 Flash-Lite",
214
+ "Type": "Large Language Model",
215
+ "Dataset": "Direct Fact Ver.",
216
+ "Score": 81.2,
217
+ "Condition": "False"
218
+ },
219
+ {
220
+ "Model": "R1",
221
+ "Type": "Large Language Model",
222
+ "Dataset": "Direct Fact Ver.",
223
+ "Score": 97.0,
224
+ "Condition": "False"
225
+ },
226
+ {
227
+ "Model": "R1-Dist.-Llama-70B",
228
+ "Type": "Large Language Model",
229
+ "Dataset": "Direct Fact Ver.",
230
+ "Score": 88.0,
231
+ "Condition": "False"
232
+ },
233
+ {
234
+ "Model": "R1-Dist.Qwen-14B",
235
+ "Type": "Large Language Model",
236
+ "Dataset": "Direct Fact Ver.",
237
+ "Score": 89.6,
238
+ "Condition": "False"
239
+ },
240
+ {
241
+ "Model": "Llama 3.3-70B Inst. Turbo",
242
+ "Type": "Large Language Model",
243
+ "Dataset": "Direct Fact Ver.",
244
+ "Score": 80.0,
245
+ "Condition": "False"
246
+ },
247
+ {
248
+ "Model": "GPT-4",
249
+ "Type": "Large Language Model",
250
+ "Dataset": "Direct Fact Ver.",
251
+ "Score": 83.0,
252
+ "Condition": "False"
253
+ },
254
+ {
255
+ "Model": "GPT-3.5",
256
+ "Type": "Large Language Model",
257
+ "Dataset": "Direct Fact Ver.",
258
+ "Score": 49.4,
259
+ "Condition": "False"
260
+ },
261
+ {
262
+ "Model": "3 Opus",
263
+ "Type": "Large Language Model",
264
+ "Dataset": "Direct Fact Ver.",
265
+ "Score": 94.4,
266
+ "Condition": "False"
267
+ },
268
+ {
269
+ "Model": "3 Sonnet",
270
+ "Type": "Large Language Model",
271
+ "Dataset": "Direct Fact Ver.",
272
+ "Score": 87.6,
273
+ "Condition": "False"
274
+ },
275
+ {
276
+ "Model": "3 Haiku",
277
+ "Type": "Large Language Model",
278
+ "Dataset": "Direct Fact Ver.",
279
+ "Score": 69.4,
280
+ "Condition": "False"
281
+ },
282
+ {
283
+ "Model": "Mixtral 8x22B",
284
+ "Type": "Large Language Model",
285
+ "Dataset": "Direct Fact Ver.",
286
+ "Score": 78.6,
287
+ "Condition": "False"
288
+ },
289
+ {
290
+ "Model": "Mixtral 8x7B",
291
+ "Type": "Large Language Model",
292
+ "Dataset": "Direct Fact Ver.",
293
+ "Score": 60.0,
294
+ "Condition": "False"
295
+ },
296
+ {
297
+ "Model": "Mixtral 7B",
298
+ "Type": "Large Language Model",
299
+ "Dataset": "Direct Fact Ver.",
300
+ "Score": 51.6,
301
+ "Condition": "False"
302
+ },
303
+ {
304
+ "Model": "Llama-3 70B",
305
+ "Type": "Large Language Model",
306
+ "Dataset": "Direct Fact Ver.",
307
+ "Score": 79.8,
308
+ "Condition": "False"
309
+ },
310
+ {
311
+ "Model": "Llama-3 8B",
312
+ "Type": "Large Language Model",
313
+ "Dataset": "Direct Fact Ver.",
314
+ "Score": 65.6,
315
+ "Condition": "False"
316
+ },
317
+ {
318
+ "Model": "Llama-2 70B",
319
+ "Type": "Large Language Model",
320
+ "Dataset": "Direct Fact Ver.",
321
+ "Score": 80.0,
322
+ "Condition": "False"
323
+ },
324
+ {
325
+ "Model": "Llama-2 13B",
326
+ "Type": "Large Language Model",
327
+ "Dataset": "Direct Fact Ver.",
328
+ "Score": 65.8,
329
+ "Condition": "False"
330
+ },
331
+ {
332
+ "Model": "Llama-2 7B",
333
+ "Type": "Large Language Model",
334
+ "Dataset": "Direct Fact Ver.",
335
+ "Score": 64.8,
336
+ "Condition": "False"
337
+ },
338
+ {
339
+ "Model": "o1",
340
+ "Type": "Large Language Model",
341
+ "Dataset": "Ver. of Assertion",
342
+ "Score": 96.2,
343
+ "Condition": "True"
344
+ },
345
+ {
346
+ "Model": "o3-mini",
347
+ "Type": "Large Language Model",
348
+ "Dataset": "Ver. of Assertion",
349
+ "Score": 94.6,
350
+ "Condition": "True"
351
+ },
352
+ {
353
+ "Model": "4o",
354
+ "Type": "Large Language Model",
355
+ "Dataset": "Ver. of Assertion",
356
+ "Score": 97.4,
357
+ "Condition": "True"
358
+ },
359
+ {
360
+ "Model": "3.7 Sonnet",
361
+ "Type": "Large Language Model",
362
+ "Dataset": "Ver. of Assertion",
363
+ "Score": 94.4,
364
+ "Condition": "True"
365
+ },
366
+ {
367
+ "Model": "3.5 Sonnet",
368
+ "Type": "Large Language Model",
369
+ "Dataset": "Ver. of Assertion",
370
+ "Score": 93.0,
371
+ "Condition": "True"
372
+ },
373
+ {
374
+ "Model": "Gemini 2 Flash",
375
+ "Type": "Large Language Model",
376
+ "Dataset": "Ver. of Assertion",
377
+ "Score": 98.0,
378
+ "Condition": "True"
379
+ },
380
+ {
381
+ "Model": "Gemini 2 Flash-Lite",
382
+ "Type": "Large Language Model",
383
+ "Dataset": "Ver. of Assertion",
384
+ "Score": 97.2,
385
+ "Condition": "True"
386
+ },
387
+ {
388
+ "Model": "R1",
389
+ "Type": "Large Language Model",
390
+ "Dataset": "Ver. of Assertion",
391
+ "Score": 91.4,
392
+ "Condition": "True"
393
+ },
394
+ {
395
+ "Model": "R1-Dist.-Llama-70B",
396
+ "Type": "Large Language Model",
397
+ "Dataset": "Ver. of Assertion",
398
+ "Score": 95.4,
399
+ "Condition": "True"
400
+ },
401
+ {
402
+ "Model": "R1-Dist.Qwen-14B",
403
+ "Type": "Large Language Model",
404
+ "Dataset": "Ver. of Assertion",
405
+ "Score": 86.6,
406
+ "Condition": "True"
407
+ },
408
+ {
409
+ "Model": "Llama 3.3-70B Inst. Turbo",
410
+ "Type": "Large Language Model",
411
+ "Dataset": "Ver. of Assertion",
412
+ "Score": 97.2,
413
+ "Condition": "True"
414
+ },
415
+ {
416
+ "Model": "GPT-4",
417
+ "Type": "Large Language Model",
418
+ "Dataset": "Ver. of Assertion",
419
+ "Score": 91.4,
420
+ "Condition": "True"
421
+ },
422
+ {
423
+ "Model": "GPT-3.5",
424
+ "Type": "Large Language Model",
425
+ "Dataset": "Ver. of Assertion",
426
+ "Score": 95.0,
427
+ "Condition": "True"
428
+ },
429
+ {
430
+ "Model": "3 Opus",
431
+ "Type": "Large Language Model",
432
+ "Dataset": "Ver. of Assertion",
433
+ "Score": 91.6,
434
+ "Condition": "True"
435
+ },
436
+ {
437
+ "Model": "3 Sonnet",
438
+ "Type": "Large Language Model",
439
+ "Dataset": "Ver. of Assertion",
440
+ "Score": 90.2,
441
+ "Condition": "True"
442
+ },
443
+ {
444
+ "Model": "3 Haiku",
445
+ "Type": "Large Language Model",
446
+ "Dataset": "Ver. of Assertion",
447
+ "Score": 95.8,
448
+ "Condition": "True"
449
+ },
450
+ {
451
+ "Model": "Mixtral 8x22B",
452
+ "Type": "Large Language Model",
453
+ "Dataset": "Ver. of Assertion",
454
+ "Score": 89.0,
455
+ "Condition": "True"
456
+ },
457
+ {
458
+ "Model": "Mixtral 8x7B",
459
+ "Type": "Large Language Model",
460
+ "Dataset": "Ver. of Assertion",
461
+ "Score": 87.6,
462
+ "Condition": "True"
463
+ },
464
+ {
465
+ "Model": "Mixtral 7B",
466
+ "Type": "Large Language Model",
467
+ "Dataset": "Ver. of Assertion",
468
+ "Score": 89.8,
469
+ "Condition": "True"
470
+ },
471
+ {
472
+ "Model": "Llama-3 70B",
473
+ "Type": "Large Language Model",
474
+ "Dataset": "Ver. of Assertion",
475
+ "Score": 91.0,
476
+ "Condition": "True"
477
+ },
478
+ {
479
+ "Model": "Llama-3 8B",
480
+ "Type": "Large Language Model",
481
+ "Dataset": "Ver. of Assertion",
482
+ "Score": 89.2,
483
+ "Condition": "True"
484
+ },
485
+ {
486
+ "Model": "Llama-2 70B",
487
+ "Type": "Large Language Model",
488
+ "Dataset": "Ver. of Assertion",
489
+ "Score": 90.0,
490
+ "Condition": "True"
491
+ },
492
+ {
493
+ "Model": "Llama-2 13B",
494
+ "Type": "Large Language Model",
495
+ "Dataset": "Ver. of Assertion",
496
+ "Score": 89.0,
497
+ "Condition": "True"
498
+ },
499
+ {
500
+ "Model": "Llama-2 7B",
501
+ "Type": "Large Language Model",
502
+ "Dataset": "Ver. of Assertion",
503
+ "Score": 88.4,
504
+ "Condition": "True"
505
+ },
506
+ {
507
+ "Model": "o1",
508
+ "Type": "Large Language Model",
509
+ "Dataset": "Ver. of 1P Knowledge",
510
+ "Score": 95.4,
511
+ "Condition": "True"
512
+ },
513
+ {
514
+ "Model": "o3-mini",
515
+ "Type": "Large Language Model",
516
+ "Dataset": "Ver. of 1P Knowledge",
517
+ "Score": 93.8,
518
+ "Condition": "True"
519
+ },
520
+ {
521
+ "Model": "4o",
522
+ "Type": "Large Language Model",
523
+ "Dataset": "Ver. of 1P Knowledge",
524
+ "Score": 97.4,
525
+ "Condition": "True"
526
+ },
527
+ {
528
+ "Model": "3.7 Sonnet",
529
+ "Type": "Large Language Model",
530
+ "Dataset": "Ver. of 1P Knowledge",
531
+ "Score": 96.8,
532
+ "Condition": "True"
533
+ },
534
+ {
535
+ "Model": "3.5 Sonnet",
536
+ "Type": "Large Language Model",
537
+ "Dataset": "Ver. of 1P Knowledge",
538
+ "Score": 97.8,
539
+ "Condition": "True"
540
+ },
541
+ {
542
+ "Model": "Gemini 2 Flash",
543
+ "Type": "Large Language Model",
544
+ "Dataset": "Ver. of 1P Knowledge",
545
+ "Score": 98.8,
546
+ "Condition": "True"
547
+ },
548
+ {
549
+ "Model": "Gemini 2 Flash-Lite",
550
+ "Type": "Large Language Model",
551
+ "Dataset": "Ver. of 1P Knowledge",
552
+ "Score": 98.0,
553
+ "Condition": "True"
554
+ },
555
+ {
556
+ "Model": "R1",
557
+ "Type": "Large Language Model",
558
+ "Dataset": "Ver. of 1P Knowledge",
559
+ "Score": 90.0,
560
+ "Condition": "True"
561
+ },
562
+ {
563
+ "Model": "R1-Dist.-Llama-70B",
564
+ "Type": "Large Language Model",
565
+ "Dataset": "Ver. of 1P Knowledge",
566
+ "Score": 95.4,
567
+ "Condition": "True"
568
+ },
569
+ {
570
+ "Model": "R1-Dist.Qwen-14B",
571
+ "Type": "Large Language Model",
572
+ "Dataset": "Ver. of 1P Knowledge",
573
+ "Score": 87.2,
574
+ "Condition": "True"
575
+ },
576
+ {
577
+ "Model": "Llama 3.3-70B Inst. Turbo",
578
+ "Type": "Large Language Model",
579
+ "Dataset": "Ver. of 1P Knowledge",
580
+ "Score": 96.6,
581
+ "Condition": "True"
582
+ },
583
+ {
584
+ "Model": "GPT-4",
585
+ "Type": "Large Language Model",
586
+ "Dataset": "Ver. of 1P Knowledge",
587
+ "Score": 94.4,
588
+ "Condition": "True"
589
+ },
590
+ {
591
+ "Model": "GPT-3.5",
592
+ "Type": "Large Language Model",
593
+ "Dataset": "Ver. of 1P Knowledge",
594
+ "Score": 95.4,
595
+ "Condition": "True"
596
+ },
597
+ {
598
+ "Model": "3 Opus",
599
+ "Type": "Large Language Model",
600
+ "Dataset": "Ver. of 1P Knowledge",
601
+ "Score": 94.0,
602
+ "Condition": "True"
603
+ },
604
+ {
605
+ "Model": "3 Sonnet",
606
+ "Type": "Large Language Model",
607
+ "Dataset": "Ver. of 1P Knowledge",
608
+ "Score": 92.2,
609
+ "Condition": "True"
610
+ },
611
+ {
612
+ "Model": "3 Haiku",
613
+ "Type": "Large Language Model",
614
+ "Dataset": "Ver. of 1P Knowledge",
615
+ "Score": 95.4,
616
+ "Condition": "True"
617
+ },
618
+ {
619
+ "Model": "Mixtral 8x22B",
620
+ "Type": "Large Language Model",
621
+ "Dataset": "Ver. of 1P Knowledge",
622
+ "Score": 92.8,
623
+ "Condition": "True"
624
+ },
625
+ {
626
+ "Model": "Mixtral 8x7B",
627
+ "Type": "Large Language Model",
628
+ "Dataset": "Ver. of 1P Knowledge",
629
+ "Score": 92.4,
630
+ "Condition": "True"
631
+ },
632
+ {
633
+ "Model": "Mixtral 7B",
634
+ "Type": "Large Language Model",
635
+ "Dataset": "Ver. of 1P Knowledge",
636
+ "Score": 93.8,
637
+ "Condition": "True"
638
+ },
639
+ {
640
+ "Model": "Llama-3 70B",
641
+ "Type": "Large Language Model",
642
+ "Dataset": "Ver. of 1P Knowledge",
643
+ "Score": 89.6,
644
+ "Condition": "True"
645
+ },
646
+ {
647
+ "Model": "Llama-3 8B",
648
+ "Type": "Large Language Model",
649
+ "Dataset": "Ver. of 1P Knowledge",
650
+ "Score": 86.0,
651
+ "Condition": "True"
652
+ },
653
+ {
654
+ "Model": "Llama-2 70B",
655
+ "Type": "Large Language Model",
656
+ "Dataset": "Ver. of 1P Knowledge",
657
+ "Score": 89.0,
658
+ "Condition": "True"
659
+ },
660
+ {
661
+ "Model": "Llama-2 13B",
662
+ "Type": "Large Language Model",
663
+ "Dataset": "Ver. of 1P Knowledge",
664
+ "Score": 85.8,
665
+ "Condition": "True"
666
+ },
667
+ {
668
+ "Model": "Llama-2 7B",
669
+ "Type": "Large Language Model",
670
+ "Dataset": "Ver. of 1P Knowledge",
671
+ "Score": 85.6,
672
+ "Condition": "True"
673
+ },
674
+ {
675
+ "Model": "o1",
676
+ "Type": "Large Language Model",
677
+ "Dataset": "Ver. of 1P Belief",
678
+ "Score": 93.8,
679
+ "Condition": "True"
680
+ },
681
+ {
682
+ "Model": "o3-mini",
683
+ "Type": "Large Language Model",
684
+ "Dataset": "Ver. of 1P Belief",
685
+ "Score": 85.8,
686
+ "Condition": "True"
687
+ },
688
+ {
689
+ "Model": "4o",
690
+ "Type": "Large Language Model",
691
+ "Dataset": "Ver. of 1P Belief",
692
+ "Score": 94.0,
693
+ "Condition": "True"
694
+ },
695
+ {
696
+ "Model": "3.7 Sonnet",
697
+ "Type": "Large Language Model",
698
+ "Dataset": "Ver. of 1P Belief",
699
+ "Score": 86.4,
700
+ "Condition": "True"
701
+ },
702
+ {
703
+ "Model": "3.5 Sonnet",
704
+ "Type": "Large Language Model",
705
+ "Dataset": "Ver. of 1P Belief",
706
+ "Score": 83.8,
707
+ "Condition": "True"
708
+ },
709
+ {
710
+ "Model": "Gemini 2 Flash",
711
+ "Type": "Large Language Model",
712
+ "Dataset": "Ver. of 1P Belief",
713
+ "Score": 89.2,
714
+ "Condition": "True"
715
+ },
716
+ {
717
+ "Model": "Gemini 2 Flash-Lite",
718
+ "Type": "Large Language Model",
719
+ "Dataset": "Ver. of 1P Belief",
720
+ "Score": 90.6,
721
+ "Condition": "True"
722
+ },
723
+ {
724
+ "Model": "R1",
725
+ "Type": "Large Language Model",
726
+ "Dataset": "Ver. of 1P Belief",
727
+ "Score": 88.6,
728
+ "Condition": "True"
729
+ },
730
+ {
731
+ "Model": "R1-Dist.-Llama-70B",
732
+ "Type": "Large Language Model",
733
+ "Dataset": "Ver. of 1P Belief",
734
+ "Score": 93.6,
735
+ "Condition": "True"
736
+ },
737
+ {
738
+ "Model": "R1-Dist.Qwen-14B",
739
+ "Type": "Large Language Model",
740
+ "Dataset": "Ver. of 1P Belief",
741
+ "Score": 86.4,
742
+ "Condition": "True"
743
+ },
744
+ {
745
+ "Model": "Llama 3.3-70B Inst. Turbo",
746
+ "Type": "Large Language Model",
747
+ "Dataset": "Ver. of 1P Belief",
748
+ "Score": 95.0,
749
+ "Condition": "True"
750
+ },
751
+ {
752
+ "Model": "GPT-4",
753
+ "Type": "Large Language Model",
754
+ "Dataset": "Ver. of 1P Belief",
755
+ "Score": 90.2,
756
+ "Condition": "True"
757
+ },
758
+ {
759
+ "Model": "GPT-3.5",
760
+ "Type": "Large Language Model",
761
+ "Dataset": "Ver. of 1P Belief",
762
+ "Score": 89.8,
763
+ "Condition": "True"
764
+ },
765
+ {
766
+ "Model": "3 Opus",
767
+ "Type": "Large Language Model",
768
+ "Dataset": "Ver. of 1P Belief",
769
+ "Score": 80.2,
770
+ "Condition": "True"
771
+ },
772
+ {
773
+ "Model": "3 Sonnet",
774
+ "Type": "Large Language Model",
775
+ "Dataset": "Ver. of 1P Belief",
776
+ "Score": 74.8,
777
+ "Condition": "True"
778
+ },
779
+ {
780
+ "Model": "3 Haiku",
781
+ "Type": "Large Language Model",
782
+ "Dataset": "Ver. of 1P Belief",
783
+ "Score": 84.8,
784
+ "Condition": "True"
785
+ },
786
+ {
787
+ "Model": "Mixtral 8x22B",
788
+ "Type": "Large Language Model",
789
+ "Dataset": "Ver. of 1P Belief",
790
+ "Score": 81.4,
791
+ "Condition": "True"
792
+ },
793
+ {
794
+ "Model": "Mixtral 8x7B",
795
+ "Type": "Large Language Model",
796
+ "Dataset": "Ver. of 1P Belief",
797
+ "Score": 81.6,
798
+ "Condition": "True"
799
+ },
800
+ {
801
+ "Model": "Mixtral 7B",
802
+ "Type": "Large Language Model",
803
+ "Dataset": "Ver. of 1P Belief",
804
+ "Score": 83.8,
805
+ "Condition": "True"
806
+ },
807
+ {
808
+ "Model": "Llama-3 70B",
809
+ "Type": "Large Language Model",
810
+ "Dataset": "Ver. of 1P Belief",
811
+ "Score": 85.6,
812
+ "Condition": "True"
813
+ },
814
+ {
815
+ "Model": "Llama-3 8B",
816
+ "Type": "Large Language Model",
817
+ "Dataset": "Ver. of 1P Belief",
818
+ "Score": 79.8,
819
+ "Condition": "True"
820
+ },
821
+ {
822
+ "Model": "Llama-2 70B",
823
+ "Type": "Large Language Model",
824
+ "Dataset": "Ver. of 1P Belief",
825
+ "Score": 85.0,
826
+ "Condition": "True"
827
+ },
828
+ {
829
+ "Model": "Llama-2 13B",
830
+ "Type": "Large Language Model",
831
+ "Dataset": "Ver. of 1P Belief",
832
+ "Score": 80.8,
833
+ "Condition": "True"
834
+ },
835
+ {
836
+ "Model": "Llama-2 7B",
837
+ "Type": "Large Language Model",
838
+ "Dataset": "Ver. of 1P Belief",
839
+ "Score": 80.4,
840
+ "Condition": "True"
841
+ },
842
+ {
843
+ "Model": "o1",
844
+ "Type": "Large Language Model",
845
+ "Dataset": "Ver. of 1P Belief",
846
+ "Score": 97.6,
847
+ "Condition": "False"
848
+ },
849
+ {
850
+ "Model": "o3-mini",
851
+ "Type": "Large Language Model",
852
+ "Dataset": "Ver. of 1P Belief",
853
+ "Score": 96.2,
854
+ "Condition": "False"
855
+ },
856
+ {
857
+ "Model": "4o",
858
+ "Type": "Large Language Model",
859
+ "Dataset": "Ver. of 1P Belief",
860
+ "Score": 93.4,
861
+ "Condition": "False"
862
+ },
863
+ {
864
+ "Model": "3.7 Sonnet",
865
+ "Type": "Large Language Model",
866
+ "Dataset": "Ver. of 1P Belief",
867
+ "Score": 98.2,
868
+ "Condition": "False"
869
+ },
870
+ {
871
+ "Model": "3.5 Sonnet",
872
+ "Type": "Large Language Model",
873
+ "Dataset": "Ver. of 1P Belief",
874
+ "Score": 97.0,
875
+ "Condition": "False"
876
+ },
877
+ {
878
+ "Model": "Gemini 2 Flash",
879
+ "Type": "Large Language Model",
880
+ "Dataset": "Ver. of 1P Belief",
881
+ "Score": 88.2,
882
+ "Condition": "False"
883
+ },
884
+ {
885
+ "Model": "Gemini 2 Flash-Lite",
886
+ "Type": "Large Language Model",
887
+ "Dataset": "Ver. of 1P Belief",
888
+ "Score": 82.4,
889
+ "Condition": "False"
890
+ },
891
+ {
892
+ "Model": "R1",
893
+ "Type": "Large Language Model",
894
+ "Dataset": "Ver. of 1P Belief",
895
+ "Score": 96.6,
896
+ "Condition": "False"
897
+ },
898
+ {
899
+ "Model": "R1-Dist.-Llama-70B",
900
+ "Type": "Large Language Model",
901
+ "Dataset": "Ver. of 1P Belief",
902
+ "Score": 92.0,
903
+ "Condition": "False"
904
+ },
905
+ {
906
+ "Model": "R1-Dist.Qwen-14B",
907
+ "Type": "Large Language Model",
908
+ "Dataset": "Ver. of 1P Belief",
909
+ "Score": 89.4,
910
+ "Condition": "False"
911
+ },
912
+ {
913
+ "Model": "Llama 3.3-70B Inst. Turbo",
914
+ "Type": "Large Language Model",
915
+ "Dataset": "Ver. of 1P Belief",
916
+ "Score": 88.2,
917
+ "Condition": "False"
918
+ },
919
+ {
920
+ "Model": "GPT-4",
921
+ "Type": "Large Language Model",
922
+ "Dataset": "Ver. of 1P Belief",
923
+ "Score": 88.2,
924
+ "Condition": "False"
925
+ },
926
+ {
927
+ "Model": "GPT-3.5",
928
+ "Type": "Large Language Model",
929
+ "Dataset": "Ver. of 1P Belief",
930
+ "Score": 62.2,
931
+ "Condition": "False"
932
+ },
933
+ {
934
+ "Model": "3 Opus",
935
+ "Type": "Large Language Model",
936
+ "Dataset": "Ver. of 1P Belief",
937
+ "Score": 94.4,
938
+ "Condition": "False"
939
+ },
940
+ {
941
+ "Model": "3 Sonnet",
942
+ "Type": "Large Language Model",
943
+ "Dataset": "Ver. of 1P Belief",
944
+ "Score": 87.4,
945
+ "Condition": "False"
946
+ },
947
+ {
948
+ "Model": "3 Haiku",
949
+ "Type": "Large Language Model",
950
+ "Dataset": "Ver. of 1P Belief",
951
+ "Score": 69.2,
952
+ "Condition": "False"
953
+ },
954
+ {
955
+ "Model": "Mixtral 8x22B",
956
+ "Type": "Large Language Model",
957
+ "Dataset": "Ver. of 1P Belief",
958
+ "Score": 80.8,
959
+ "Condition": "False"
960
+ },
961
+ {
962
+ "Model": "Mixtral 8x7B",
963
+ "Type": "Large Language Model",
964
+ "Dataset": "Ver. of 1P Belief",
965
+ "Score": 62.4,
966
+ "Condition": "False"
967
+ },
968
+ {
969
+ "Model": "Mixtral 7B",
970
+ "Type": "Large Language Model",
971
+ "Dataset": "Ver. of 1P Belief",
972
+ "Score": 30.4,
973
+ "Condition": "False"
974
+ },
975
+ {
976
+ "Model": "Llama-3 70B",
977
+ "Type": "Large Language Model",
978
+ "Dataset": "Ver. of 1P Belief",
979
+ "Score": 87.4,
980
+ "Condition": "False"
981
+ },
982
+ {
983
+ "Model": "Llama-3 8B",
984
+ "Type": "Large Language Model",
985
+ "Dataset": "Ver. of 1P Belief",
986
+ "Score": 75.4,
987
+ "Condition": "False"
988
+ },
989
+ {
990
+ "Model": "Llama-2 70B",
991
+ "Type": "Large Language Model",
992
+ "Dataset": "Ver. of 1P Belief",
993
+ "Score": 87.4,
994
+ "Condition": "False"
995
+ },
996
+ {
997
+ "Model": "Llama-2 13B",
998
+ "Type": "Large Language Model",
999
+ "Dataset": "Ver. of 1P Belief",
1000
+ "Score": 72.8,
1001
+ "Condition": "False"
1002
+ },
1003
+ {
1004
+ "Model": "Llama-2 7B",
1005
+ "Type": "Large Language Model",
1006
+ "Dataset": "Ver. of 1P Belief",
1007
+ "Score": 72.8,
1008
+ "Condition": "False"
1009
+ },
1010
+ {
1011
+ "Model": "o1",
1012
+ "Type": "Large Language Model",
1013
+ "Dataset": "Conf. of 1P Belief",
1014
+ "Score": 99.6,
1015
+ "Condition": "True"
1016
+ },
1017
+ {
1018
+ "Model": "o3-mini",
1019
+ "Type": "Large Language Model",
1020
+ "Dataset": "Conf. of 1P Belief",
1021
+ "Score": 98.0,
1022
+ "Condition": "True"
1023
+ },
1024
+ {
1025
+ "Model": "4o",
1026
+ "Type": "Large Language Model",
1027
+ "Dataset": "Conf. of 1P Belief",
1028
+ "Score": 98.2,
1029
+ "Condition": "True"
1030
+ },
1031
+ {
1032
+ "Model": "3.7 Sonnet",
1033
+ "Type": "Large Language Model",
1034
+ "Dataset": "Conf. of 1P Belief",
1035
+ "Score": 98.6,
1036
+ "Condition": "True"
1037
+ },
1038
+ {
1039
+ "Model": "3.5 Sonnet",
1040
+ "Type": "Large Language Model",
1041
+ "Dataset": "Conf. of 1P Belief",
1042
+ "Score": 99.0,
1043
+ "Condition": "True"
1044
+ },
1045
+ {
1046
+ "Model": "Gemini 2 Flash",
1047
+ "Type": "Large Language Model",
1048
+ "Dataset": "Conf. of 1P Belief",
1049
+ "Score": 99.6,
1050
+ "Condition": "True"
1051
+ },
1052
+ {
1053
+ "Model": "Gemini 2 Flash-Lite",
1054
+ "Type": "Large Language Model",
1055
+ "Dataset": "Conf. of 1P Belief",
1056
+ "Score": 99.8,
1057
+ "Condition": "True"
1058
+ },
1059
+ {
1060
+ "Model": "R1",
1061
+ "Type": "Large Language Model",
1062
+ "Dataset": "Conf. of 1P Belief",
1063
+ "Score": 90.4,
1064
+ "Condition": "True"
1065
+ },
1066
+ {
1067
+ "Model": "R1-Dist.-Llama-70B",
1068
+ "Type": "Large Language Model",
1069
+ "Dataset": "Conf. of 1P Belief",
1070
+ "Score": 96.2,
1071
+ "Condition": "True"
1072
+ },
1073
+ {
1074
+ "Model": "R1-Dist.Qwen-14B",
1075
+ "Type": "Large Language Model",
1076
+ "Dataset": "Conf. of 1P Belief",
1077
+ "Score": 86.8,
1078
+ "Condition": "True"
1079
+ },
1080
+ {
1081
+ "Model": "Llama 3.3-70B Inst. Turbo",
1082
+ "Type": "Large Language Model",
1083
+ "Dataset": "Conf. of 1P Belief",
1084
+ "Score": 100.0,
1085
+ "Condition": "True"
1086
+ },
1087
+ {
1088
+ "Model": "GPT-4",
1089
+ "Type": "Large Language Model",
1090
+ "Dataset": "Conf. of 1P Belief",
1091
+ "Score": 93.4,
1092
+ "Condition": "True"
1093
+ },
1094
+ {
1095
+ "Model": "GPT-3.5",
1096
+ "Type": "Large Language Model",
1097
+ "Dataset": "Conf. of 1P Belief",
1098
+ "Score": 94.8,
1099
+ "Condition": "True"
1100
+ },
1101
+ {
1102
+ "Model": "3 Opus",
1103
+ "Type": "Large Language Model",
1104
+ "Dataset": "Conf. of 1P Belief",
1105
+ "Score": 89.0,
1106
+ "Condition": "True"
1107
+ },
1108
+ {
1109
+ "Model": "3 Sonnet",
1110
+ "Type": "Large Language Model",
1111
+ "Dataset": "Conf. of 1P Belief",
1112
+ "Score": 94.0,
1113
+ "Condition": "True"
1114
+ },
1115
+ {
1116
+ "Model": "3 Haiku",
1117
+ "Type": "Large Language Model",
1118
+ "Dataset": "Conf. of 1P Belief",
1119
+ "Score": 93.4,
1120
+ "Condition": "True"
1121
+ },
1122
+ {
1123
+ "Model": "Mixtral 8x22B",
1124
+ "Type": "Large Language Model",
1125
+ "Dataset": "Conf. of 1P Belief",
1126
+ "Score": 84.2,
1127
+ "Condition": "True"
1128
+ },
1129
+ {
1130
+ "Model": "Mixtral 8x7B",
1131
+ "Type": "Large Language Model",
1132
+ "Dataset": "Conf. of 1P Belief",
1133
+ "Score": 89.4,
1134
+ "Condition": "True"
1135
+ },
1136
+ {
1137
+ "Model": "Mixtral 7B",
1138
+ "Type": "Large Language Model",
1139
+ "Dataset": "Conf. of 1P Belief",
1140
+ "Score": 82.2,
1141
+ "Condition": "True"
1142
+ },
1143
+ {
1144
+ "Model": "Llama-3 70B",
1145
+ "Type": "Large Language Model",
1146
+ "Dataset": "Conf. of 1P Belief",
1147
+ "Score": 96.0,
1148
+ "Condition": "True"
1149
+ },
1150
+ {
1151
+ "Model": "Llama-3 8B",
1152
+ "Type": "Large Language Model",
1153
+ "Dataset": "Conf. of 1P Belief",
1154
+ "Score": 91.0,
1155
+ "Condition": "True"
1156
+ },
1157
+ {
1158
+ "Model": "Llama-2 70B",
1159
+ "Type": "Large Language Model",
1160
+ "Dataset": "Conf. of 1P Belief",
1161
+ "Score": 95.4,
1162
+ "Condition": "True"
1163
+ },
1164
+ {
1165
+ "Model": "Llama-2 13B",
1166
+ "Type": "Large Language Model",
1167
+ "Dataset": "Conf. of 1P Belief",
1168
+ "Score": 90.2,
1169
+ "Condition": "True"
1170
+ },
1171
+ {
1172
+ "Model": "Llama-2 7B",
1173
+ "Type": "Large Language Model",
1174
+ "Dataset": "Conf. of 1P Belief",
1175
+ "Score": 91.2,
1176
+ "Condition": "True"
1177
+ },
1178
+ {
1179
+ "Model": "o1",
1180
+ "Type": "Large Language Model",
1181
+ "Dataset": "Conf. of 1P Belief",
1182
+ "Score": 83.8,
1183
+ "Condition": "False"
1184
+ },
1185
+ {
1186
+ "Model": "o3-mini",
1187
+ "Type": "Large Language Model",
1188
+ "Dataset": "Conf. of 1P Belief",
1189
+ "Score": 66.6,
1190
+ "Condition": "False"
1191
+ },
1192
+ {
1193
+ "Model": "4o",
1194
+ "Type": "Large Language Model",
1195
+ "Dataset": "Conf. of 1P Belief",
1196
+ "Score": 64.4,
1197
+ "Condition": "False"
1198
+ },
1199
+ {
1200
+ "Model": "3.7 Sonnet",
1201
+ "Type": "Large Language Model",
1202
+ "Dataset": "Conf. of 1P Belief",
1203
+ "Score": 67.8,
1204
+ "Condition": "False"
1205
+ },
1206
+ {
1207
+ "Model": "3.5 Sonnet",
1208
+ "Type": "Large Language Model",
1209
+ "Dataset": "Conf. of 1P Belief",
1210
+ "Score": 69.0,
1211
+ "Condition": "False"
1212
+ },
1213
+ {
1214
+ "Model": "Gemini 2 Flash",
1215
+ "Type": "Large Language Model",
1216
+ "Dataset": "Conf. of 1P Belief",
1217
+ "Score": 87.6,
1218
+ "Condition": "False"
1219
+ },
1220
+ {
1221
+ "Model": "Gemini 2 Flash-Lite",
1222
+ "Type": "Large Language Model",
1223
+ "Dataset": "Conf. of 1P Belief",
1224
+ "Score": 92.4,
1225
+ "Condition": "False"
1226
+ },
1227
+ {
1228
+ "Model": "R1",
1229
+ "Type": "Large Language Model",
1230
+ "Dataset": "Conf. of 1P Belief",
1231
+ "Score": 14.4,
1232
+ "Condition": "False"
1233
+ },
1234
+ {
1235
+ "Model": "R1-Dist.-Llama-70B",
1236
+ "Type": "Large Language Model",
1237
+ "Dataset": "Conf. of 1P Belief",
1238
+ "Score": 29.6,
1239
+ "Condition": "False"
1240
+ },
1241
+ {
1242
+ "Model": "R1-Dist.Qwen-14B",
1243
+ "Type": "Large Language Model",
1244
+ "Dataset": "Conf. of 1P Belief",
1245
+ "Score": 18.4,
1246
+ "Condition": "False"
1247
+ },
1248
+ {
1249
+ "Model": "Llama 3.3-70B Inst. Turbo",
1250
+ "Type": "Large Language Model",
1251
+ "Dataset": "Conf. of 1P Belief",
1252
+ "Score": 94.2,
1253
+ "Condition": "False"
1254
+ },
1255
+ {
1256
+ "Model": "GPT-4",
1257
+ "Type": "Large Language Model",
1258
+ "Dataset": "Conf. of 1P Belief",
1259
+ "Score": 22.0,
1260
+ "Condition": "False"
1261
+ },
1262
+ {
1263
+ "Model": "GPT-3.5",
1264
+ "Type": "Large Language Model",
1265
+ "Dataset": "Conf. of 1P Belief",
1266
+ "Score": 51.0,
1267
+ "Condition": "False"
1268
+ },
1269
+ {
1270
+ "Model": "3 Opus",
1271
+ "Type": "Large Language Model",
1272
+ "Dataset": "Conf. of 1P Belief",
1273
+ "Score": 45.6,
1274
+ "Condition": "False"
1275
+ },
1276
+ {
1277
+ "Model": "3 Sonnet",
1278
+ "Type": "Large Language Model",
1279
+ "Dataset": "Conf. of 1P Belief",
1280
+ "Score": 54.8,
1281
+ "Condition": "False"
1282
+ },
1283
+ {
1284
+ "Model": "3 Haiku",
1285
+ "Type": "Large Language Model",
1286
+ "Dataset": "Conf. of 1P Belief",
1287
+ "Score": 50.0,
1288
+ "Condition": "False"
1289
+ },
1290
+ {
1291
+ "Model": "Mixtral 8x22B",
1292
+ "Type": "Large Language Model",
1293
+ "Dataset": "Conf. of 1P Belief",
1294
+ "Score": 18.8,
1295
+ "Condition": "False"
1296
+ },
1297
+ {
1298
+ "Model": "Mixtral 8x7B",
1299
+ "Type": "Large Language Model",
1300
+ "Dataset": "Conf. of 1P Belief",
1301
+ "Score": 44.8,
1302
+ "Condition": "False"
1303
+ },
1304
+ {
1305
+ "Model": "Mixtral 7B",
1306
+ "Type": "Large Language Model",
1307
+ "Dataset": "Conf. of 1P Belief",
1308
+ "Score": 66.8,
1309
+ "Condition": "False"
1310
+ },
1311
+ {
1312
+ "Model": "Llama-3 70B",
1313
+ "Type": "Large Language Model",
1314
+ "Dataset": "Conf. of 1P Belief",
1315
+ "Score": 83.2,
1316
+ "Condition": "False"
1317
+ },
1318
+ {
1319
+ "Model": "Llama-3 8B",
1320
+ "Type": "Large Language Model",
1321
+ "Dataset": "Conf. of 1P Belief",
1322
+ "Score": 55.6,
1323
+ "Condition": "False"
1324
+ },
1325
+ {
1326
+ "Model": "Llama-2 70B",
1327
+ "Type": "Large Language Model",
1328
+ "Dataset": "Conf. of 1P Belief",
1329
+ "Score": 77.2,
1330
+ "Condition": "False"
1331
+ },
1332
+ {
1333
+ "Model": "Llama-2 13B",
1334
+ "Type": "Large Language Model",
1335
+ "Dataset": "Conf. of 1P Belief",
1336
+ "Score": 57.0,
1337
+ "Condition": "False"
1338
+ },
1339
+ {
1340
+ "Model": "Llama-2 7B",
1341
+ "Type": "Large Language Model",
1342
+ "Dataset": "Conf. of 1P Belief",
1343
+ "Score": 55.8,
1344
+ "Condition": "False"
1345
+ },
1346
+ {
1347
+ "Model": "o1",
1348
+ "Type": "Large Language Model",
1349
+ "Dataset": "Intrsp. of 1P Belief",
1350
+ "Score": 97.2,
1351
+ "Condition": "True"
1352
+ },
1353
+ {
1354
+ "Model": "o3-mini",
1355
+ "Type": "Large Language Model",
1356
+ "Dataset": "Intrsp. of 1P Belief",
1357
+ "Score": 93.8,
1358
+ "Condition": "True"
1359
+ },
1360
+ {
1361
+ "Model": "4o",
1362
+ "Type": "Large Language Model",
1363
+ "Dataset": "Intrsp. of 1P Belief",
1364
+ "Score": 98.4,
1365
+ "Condition": "True"
1366
+ },
1367
+ {
1368
+ "Model": "3.7 Sonnet",
1369
+ "Type": "Large Language Model",
1370
+ "Dataset": "Intrsp. of 1P Belief",
1371
+ "Score": 96.8,
1372
+ "Condition": "True"
1373
+ },
1374
+ {
1375
+ "Model": "3.5 Sonnet",
1376
+ "Type": "Large Language Model",
1377
+ "Dataset": "Intrsp. of 1P Belief",
1378
+ "Score": 95.0,
1379
+ "Condition": "True"
1380
+ },
1381
+ {
1382
+ "Model": "Gemini 2 Flash",
1383
+ "Type": "Large Language Model",
1384
+ "Dataset": "Intrsp. of 1P Belief",
1385
+ "Score": 97.8,
1386
+ "Condition": "True"
1387
+ },
1388
+ {
1389
+ "Model": "Gemini 2 Flash-Lite",
1390
+ "Type": "Large Language Model",
1391
+ "Dataset": "Intrsp. of 1P Belief",
1392
+ "Score": 99.0,
1393
+ "Condition": "True"
1394
+ },
1395
+ {
1396
+ "Model": "R1",
1397
+ "Type": "Large Language Model",
1398
+ "Dataset": "Intrsp. of 1P Belief",
1399
+ "Score": 89.2,
1400
+ "Condition": "True"
1401
+ },
1402
+ {
1403
+ "Model": "R1-Dist.-Llama-70B",
1404
+ "Type": "Large Language Model",
1405
+ "Dataset": "Intrsp. of 1P Belief",
1406
+ "Score": 92.8,
1407
+ "Condition": "True"
1408
+ },
1409
+ {
1410
+ "Model": "R1-Dist.Qwen-14B",
1411
+ "Type": "Large Language Model",
1412
+ "Dataset": "Intrsp. of 1P Belief",
1413
+ "Score": 85.4,
1414
+ "Condition": "True"
1415
+ },
1416
+ {
1417
+ "Model": "Llama 3.3-70B Inst. Turbo",
1418
+ "Type": "Large Language Model",
1419
+ "Dataset": "Intrsp. of 1P Belief",
1420
+ "Score": 98.6,
1421
+ "Condition": "True"
1422
+ },
1423
+ {
1424
+ "Model": "GPT-4",
1425
+ "Type": "Large Language Model",
1426
+ "Dataset": "Intrsp. of 1P Belief",
1427
+ "Score": 93.0,
1428
+ "Condition": "True"
1429
+ },
1430
+ {
1431
+ "Model": "GPT-3.5",
1432
+ "Type": "Large Language Model",
1433
+ "Dataset": "Intrsp. of 1P Belief",
1434
+ "Score": 93.2,
1435
+ "Condition": "True"
1436
+ },
1437
+ {
1438
+ "Model": "3 Opus",
1439
+ "Type": "Large Language Model",
1440
+ "Dataset": "Intrsp. of 1P Belief",
1441
+ "Score": 96.2,
1442
+ "Condition": "True"
1443
+ },
1444
+ {
1445
+ "Model": "3 Sonnet",
1446
+ "Type": "Large Language Model",
1447
+ "Dataset": "Intrsp. of 1P Belief",
1448
+ "Score": 93.8,
1449
+ "Condition": "True"
1450
+ },
1451
+ {
1452
+ "Model": "3 Haiku",
1453
+ "Type": "Large Language Model",
1454
+ "Dataset": "Intrsp. of 1P Belief",
1455
+ "Score": 86.0,
1456
+ "Condition": "True"
1457
+ },
1458
+ {
1459
+ "Model": "Mixtral 8x22B",
1460
+ "Type": "Large Language Model",
1461
+ "Dataset": "Intrsp. of 1P Belief",
1462
+ "Score": 81.6,
1463
+ "Condition": "True"
1464
+ },
1465
+ {
1466
+ "Model": "Mixtral 8x7B",
1467
+ "Type": "Large Language Model",
1468
+ "Dataset": "Intrsp. of 1P Belief",
1469
+ "Score": 83.6,
1470
+ "Condition": "True"
1471
+ },
1472
+ {
1473
+ "Model": "Mixtral 7B",
1474
+ "Type": "Large Language Model",
1475
+ "Dataset": "Intrsp. of 1P Belief",
1476
+ "Score": 75.4,
1477
+ "Condition": "True"
1478
+ },
1479
+ {
1480
+ "Model": "Llama-3 70B",
1481
+ "Type": "Large Language Model",
1482
+ "Dataset": "Intrsp. of 1P Belief",
1483
+ "Score": 93.6,
1484
+ "Condition": "True"
1485
+ },
1486
+ {
1487
+ "Model": "Llama-3 8B",
1488
+ "Type": "Large Language Model",
1489
+ "Dataset": "Intrsp. of 1P Belief",
1490
+ "Score": 81.6,
1491
+ "Condition": "True"
1492
+ },
1493
+ {
1494
+ "Model": "Llama-2 70B",
1495
+ "Type": "Large Language Model",
1496
+ "Dataset": "Intrsp. of 1P Belief",
1497
+ "Score": 91.8,
1498
+ "Condition": "True"
1499
+ },
1500
+ {
1501
+ "Model": "Llama-2 13B",
1502
+ "Type": "Large Language Model",
1503
+ "Dataset": "Intrsp. of 1P Belief",
1504
+ "Score": 82.2,
1505
+ "Condition": "True"
1506
+ },
1507
+ {
1508
+ "Model": "Llama-2 7B",
1509
+ "Type": "Large Language Model",
1510
+ "Dataset": "Intrsp. of 1P Belief",
1511
+ "Score": 83.2,
1512
+ "Condition": "True"
1513
+ },
1514
+ {
1515
+ "Model": "o1",
1516
+ "Type": "Large Language Model",
1517
+ "Dataset": "Intrsp. of 1P Belief",
1518
+ "Score": 27.4,
1519
+ "Condition": "False"
1520
+ },
1521
+ {
1522
+ "Model": "o3-mini",
1523
+ "Type": "Large Language Model",
1524
+ "Dataset": "Intrsp. of 1P Belief",
1525
+ "Score": 50.6,
1526
+ "Condition": "False"
1527
+ },
1528
+ {
1529
+ "Model": "4o",
1530
+ "Type": "Large Language Model",
1531
+ "Dataset": "Intrsp. of 1P Belief",
1532
+ "Score": 57.2,
1533
+ "Condition": "False"
1534
+ },
1535
+ {
1536
+ "Model": "3.7 Sonnet",
1537
+ "Type": "Large Language Model",
1538
+ "Dataset": "Intrsp. of 1P Belief",
1539
+ "Score": 39.2,
1540
+ "Condition": "False"
1541
+ },
1542
+ {
1543
+ "Model": "3.5 Sonnet",
1544
+ "Type": "Large Language Model",
1545
+ "Dataset": "Intrsp. of 1P Belief",
1546
+ "Score": 50.0,
1547
+ "Condition": "False"
1548
+ },
1549
+ {
1550
+ "Model": "Gemini 2 Flash",
1551
+ "Type": "Large Language Model",
1552
+ "Dataset": "Intrsp. of 1P Belief",
1553
+ "Score": 63.0,
1554
+ "Condition": "False"
1555
+ },
1556
+ {
1557
+ "Model": "Gemini 2 Flash-Lite",
1558
+ "Type": "Large Language Model",
1559
+ "Dataset": "Intrsp. of 1P Belief",
1560
+ "Score": 84.6,
1561
+ "Condition": "False"
1562
+ },
1563
+ {
1564
+ "Model": "R1",
1565
+ "Type": "Large Language Model",
1566
+ "Dataset": "Intrsp. of 1P Belief",
1567
+ "Score": 18.2,
1568
+ "Condition": "False"
1569
+ },
1570
+ {
1571
+ "Model": "R1-Dist.-Llama-70B",
1572
+ "Type": "Large Language Model",
1573
+ "Dataset": "Intrsp. of 1P Belief",
1574
+ "Score": 16.2,
1575
+ "Condition": "False"
1576
+ },
1577
+ {
1578
+ "Model": "R1-Dist.Qwen-14B",
1579
+ "Type": "Large Language Model",
1580
+ "Dataset": "Intrsp. of 1P Belief",
1581
+ "Score": 19.0,
1582
+ "Condition": "False"
1583
+ },
1584
+ {
1585
+ "Model": "Llama 3.3-70B Inst. Turbo",
1586
+ "Type": "Large Language Model",
1587
+ "Dataset": "Intrsp. of 1P Belief",
1588
+ "Score": 63.6,
1589
+ "Condition": "False"
1590
+ },
1591
+ {
1592
+ "Model": "GPT-4",
1593
+ "Type": "Large Language Model",
1594
+ "Dataset": "Intrsp. of 1P Belief",
1595
+ "Score": 17.6,
1596
+ "Condition": "False"
1597
+ },
1598
+ {
1599
+ "Model": "GPT-3.5",
1600
+ "Type": "Large Language Model",
1601
+ "Dataset": "Intrsp. of 1P Belief",
1602
+ "Score": 46.2,
1603
+ "Condition": "False"
1604
+ },
1605
+ {
1606
+ "Model": "3 Opus",
1607
+ "Type": "Large Language Model",
1608
+ "Dataset": "Intrsp. of 1P Belief",
1609
+ "Score": 55.8,
1610
+ "Condition": "False"
1611
+ },
1612
+ {
1613
+ "Model": "3 Sonnet",
1614
+ "Type": "Large Language Model",
1615
+ "Dataset": "Intrsp. of 1P Belief",
1616
+ "Score": 46.8,
1617
+ "Condition": "False"
1618
+ },
1619
+ {
1620
+ "Model": "3 Haiku",
1621
+ "Type": "Large Language Model",
1622
+ "Dataset": "Intrsp. of 1P Belief",
1623
+ "Score": 34.2,
1624
+ "Condition": "False"
1625
+ },
1626
+ {
1627
+ "Model": "Mixtral 8x22B",
1628
+ "Type": "Large Language Model",
1629
+ "Dataset": "Intrsp. of 1P Belief",
1630
+ "Score": 19.2,
1631
+ "Condition": "False"
1632
+ },
1633
+ {
1634
+ "Model": "Mixtral 8x7B",
1635
+ "Type": "Large Language Model",
1636
+ "Dataset": "Intrsp. of 1P Belief",
1637
+ "Score": 44.6,
1638
+ "Condition": "False"
1639
+ },
1640
+ {
1641
+ "Model": "Mixtral 7B",
1642
+ "Type": "Large Language Model",
1643
+ "Dataset": "Intrsp. of 1P Belief",
1644
+ "Score": 58.4,
1645
+ "Condition": "False"
1646
+ },
1647
+ {
1648
+ "Model": "Llama-3 70B",
1649
+ "Type": "Large Language Model",
1650
+ "Dataset": "Intrsp. of 1P Belief",
1651
+ "Score": 58.2,
1652
+ "Condition": "False"
1653
+ },
1654
+ {
1655
+ "Model": "Llama-3 8B",
1656
+ "Type": "Large Language Model",
1657
+ "Dataset": "Intrsp. of 1P Belief",
1658
+ "Score": 41.2,
1659
+ "Condition": "False"
1660
+ },
1661
+ {
1662
+ "Model": "Llama-2 70B",
1663
+ "Type": "Large Language Model",
1664
+ "Dataset": "Intrsp. of 1P Belief",
1665
+ "Score": 56.2,
1666
+ "Condition": "False"
1667
+ },
1668
+ {
1669
+ "Model": "Llama-2 13B",
1670
+ "Type": "Large Language Model",
1671
+ "Dataset": "Intrsp. of 1P Belief",
1672
+ "Score": 41.6,
1673
+ "Condition": "False"
1674
+ },
1675
+ {
1676
+ "Model": "Llama-2 7B",
1677
+ "Type": "Large Language Model",
1678
+ "Dataset": "Intrsp. of 1P Belief",
1679
+ "Score": 43.0,
1680
+ "Condition": "False"
1681
+ },
1682
+ {
1683
+ "Model": "o1",
1684
+ "Type": "Large Language Model",
1685
+ "Dataset": "Conf. of 3P Belief (J)",
1686
+ "Score": 100.0,
1687
+ "Condition": "True"
1688
+ },
1689
+ {
1690
+ "Model": "o3-mini",
1691
+ "Type": "Large Language Model",
1692
+ "Dataset": "Conf. of 3P Belief (J)",
1693
+ "Score": 100.0,
1694
+ "Condition": "True"
1695
+ },
1696
+ {
1697
+ "Model": "4o",
1698
+ "Type": "Large Language Model",
1699
+ "Dataset": "Conf. of 3P Belief (J)",
1700
+ "Score": 99.0,
1701
+ "Condition": "True"
1702
+ },
1703
+ {
1704
+ "Model": "3.7 Sonnet",
1705
+ "Type": "Large Language Model",
1706
+ "Dataset": "Conf. of 3P Belief (J)",
1707
+ "Score": 99.8,
1708
+ "Condition": "True"
1709
+ },
1710
+ {
1711
+ "Model": "3.5 Sonnet",
1712
+ "Type": "Large Language Model",
1713
+ "Dataset": "Conf. of 3P Belief (J)",
1714
+ "Score": 99.8,
1715
+ "Condition": "True"
1716
+ },
1717
+ {
1718
+ "Model": "Gemini 2 Flash",
1719
+ "Type": "Large Language Model",
1720
+ "Dataset": "Conf. of 3P Belief (J)",
1721
+ "Score": 100.0,
1722
+ "Condition": "True"
1723
+ },
1724
+ {
1725
+ "Model": "Gemini 2 Flash-Lite",
1726
+ "Type": "Large Language Model",
1727
+ "Dataset": "Conf. of 3P Belief (J)",
1728
+ "Score": 100.0,
1729
+ "Condition": "True"
1730
+ },
1731
+ {
1732
+ "Model": "R1",
1733
+ "Type": "Large Language Model",
1734
+ "Dataset": "Conf. of 3P Belief (J)",
1735
+ "Score": 99.2,
1736
+ "Condition": "True"
1737
+ },
1738
+ {
1739
+ "Model": "R1-Dist.-Llama-70B",
1740
+ "Type": "Large Language Model",
1741
+ "Dataset": "Conf. of 3P Belief (J)",
1742
+ "Score": 99.6,
1743
+ "Condition": "True"
1744
+ },
1745
+ {
1746
+ "Model": "R1-Dist.Qwen-14B",
1747
+ "Type": "Large Language Model",
1748
+ "Dataset": "Conf. of 3P Belief (J)",
1749
+ "Score": 97.8,
1750
+ "Condition": "True"
1751
+ },
1752
+ {
1753
+ "Model": "Llama 3.3-70B Inst. Turbo",
1754
+ "Type": "Large Language Model",
1755
+ "Dataset": "Conf. of 3P Belief (J)",
1756
+ "Score": 100.0,
1757
+ "Condition": "True"
1758
+ },
1759
+ {
1760
+ "Model": "GPT-4",
1761
+ "Type": "Large Language Model",
1762
+ "Dataset": "Conf. of 3P Belief (J)",
1763
+ "Score": 98.4,
1764
+ "Condition": "True"
1765
+ },
1766
+ {
1767
+ "Model": "GPT-3.5",
1768
+ "Type": "Large Language Model",
1769
+ "Dataset": "Conf. of 3P Belief (J)",
1770
+ "Score": 95.6,
1771
+ "Condition": "True"
1772
+ },
1773
+ {
1774
+ "Model": "3 Opus",
1775
+ "Type": "Large Language Model",
1776
+ "Dataset": "Conf. of 3P Belief (J)",
1777
+ "Score": 96.6,
1778
+ "Condition": "True"
1779
+ },
1780
+ {
1781
+ "Model": "3 Sonnet",
1782
+ "Type": "Large Language Model",
1783
+ "Dataset": "Conf. of 3P Belief (J)",
1784
+ "Score": 97.2,
1785
+ "Condition": "True"
1786
+ },
1787
+ {
1788
+ "Model": "3 Haiku",
1789
+ "Type": "Large Language Model",
1790
+ "Dataset": "Conf. of 3P Belief (J)",
1791
+ "Score": 97.6,
1792
+ "Condition": "True"
1793
+ },
1794
+ {
1795
+ "Model": "Mixtral 8x22B",
1796
+ "Type": "Large Language Model",
1797
+ "Dataset": "Conf. of 3P Belief (J)",
1798
+ "Score": 98.0,
1799
+ "Condition": "True"
1800
+ },
1801
+ {
1802
+ "Model": "Mixtral 8x7B",
1803
+ "Type": "Large Language Model",
1804
+ "Dataset": "Conf. of 3P Belief (J)",
1805
+ "Score": 87.2,
1806
+ "Condition": "True"
1807
+ },
1808
+ {
1809
+ "Model": "Mixtral 7B",
1810
+ "Type": "Large Language Model",
1811
+ "Dataset": "Conf. of 3P Belief (J)",
1812
+ "Score": 92.4,
1813
+ "Condition": "True"
1814
+ },
1815
+ {
1816
+ "Model": "Llama-3 70B",
1817
+ "Type": "Large Language Model",
1818
+ "Dataset": "Conf. of 3P Belief (J)",
1819
+ "Score": 96.2,
1820
+ "Condition": "True"
1821
+ },
1822
+ {
1823
+ "Model": "Llama-3 8B",
1824
+ "Type": "Large Language Model",
1825
+ "Dataset": "Conf. of 3P Belief (J)",
1826
+ "Score": 93.2,
1827
+ "Condition": "True"
1828
+ },
1829
+ {
1830
+ "Model": "Llama-2 70B",
1831
+ "Type": "Large Language Model",
1832
+ "Dataset": "Conf. of 3P Belief (J)",
1833
+ "Score": 96.2,
1834
+ "Condition": "True"
1835
+ },
1836
+ {
1837
+ "Model": "Llama-2 13B",
1838
+ "Type": "Large Language Model",
1839
+ "Dataset": "Conf. of 3P Belief (J)",
1840
+ "Score": 93.6,
1841
+ "Condition": "True"
1842
+ },
1843
+ {
1844
+ "Model": "Llama-2 7B",
1845
+ "Type": "Large Language Model",
1846
+ "Dataset": "Conf. of 3P Belief (J)",
1847
+ "Score": 93.6,
1848
+ "Condition": "True"
1849
+ },
1850
+ {
1851
+ "Model": "o1",
1852
+ "Type": "Large Language Model",
1853
+ "Dataset": "Conf. of 3P Belief (J)",
1854
+ "Score": 99.2,
1855
+ "Condition": "False"
1856
+ },
1857
+ {
1858
+ "Model": "o3-mini",
1859
+ "Type": "Large Language Model",
1860
+ "Dataset": "Conf. of 3P Belief (J)",
1861
+ "Score": 99.6,
1862
+ "Condition": "False"
1863
+ },
1864
+ {
1865
+ "Model": "4o",
1866
+ "Type": "Large Language Model",
1867
+ "Dataset": "Conf. of 3P Belief (J)",
1868
+ "Score": 87.4,
1869
+ "Condition": "False"
1870
+ },
1871
+ {
1872
+ "Model": "3.7 Sonnet",
1873
+ "Type": "Large Language Model",
1874
+ "Dataset": "Conf. of 3P Belief (J)",
1875
+ "Score": 98.4,
1876
+ "Condition": "False"
1877
+ },
1878
+ {
1879
+ "Model": "3.5 Sonnet",
1880
+ "Type": "Large Language Model",
1881
+ "Dataset": "Conf. of 3P Belief (J)",
1882
+ "Score": 97.2,
1883
+ "Condition": "False"
1884
+ },
1885
+ {
1886
+ "Model": "Gemini 2 Flash",
1887
+ "Type": "Large Language Model",
1888
+ "Dataset": "Conf. of 3P Belief (J)",
1889
+ "Score": 99.0,
1890
+ "Condition": "False"
1891
+ },
1892
+ {
1893
+ "Model": "Gemini 2 Flash-Lite",
1894
+ "Type": "Large Language Model",
1895
+ "Dataset": "Conf. of 3P Belief (J)",
1896
+ "Score": 94.6,
1897
+ "Condition": "False"
1898
+ },
1899
+ {
1900
+ "Model": "R1",
1901
+ "Type": "Large Language Model",
1902
+ "Dataset": "Conf. of 3P Belief (J)",
1903
+ "Score": 94.2,
1904
+ "Condition": "False"
1905
+ },
1906
+ {
1907
+ "Model": "R1-Dist.-Llama-70B",
1908
+ "Type": "Large Language Model",
1909
+ "Dataset": "Conf. of 3P Belief (J)",
1910
+ "Score": 96.4,
1911
+ "Condition": "False"
1912
+ },
1913
+ {
1914
+ "Model": "R1-Dist.Qwen-14B",
1915
+ "Type": "Large Language Model",
1916
+ "Dataset": "Conf. of 3P Belief (J)",
1917
+ "Score": 79.6,
1918
+ "Condition": "False"
1919
+ },
1920
+ {
1921
+ "Model": "Llama 3.3-70B Inst. Turbo",
1922
+ "Type": "Large Language Model",
1923
+ "Dataset": "Conf. of 3P Belief (J)",
1924
+ "Score": 99.6,
1925
+ "Condition": "False"
1926
+ },
1927
+ {
1928
+ "Model": "GPT-4",
1929
+ "Type": "Large Language Model",
1930
+ "Dataset": "Conf. of 3P Belief (J)",
1931
+ "Score": 74.0,
1932
+ "Condition": "False"
1933
+ },
1934
+ {
1935
+ "Model": "GPT-3.5",
1936
+ "Type": "Large Language Model",
1937
+ "Dataset": "Conf. of 3P Belief (J)",
1938
+ "Score": 62.4,
1939
+ "Condition": "False"
1940
+ },
1941
+ {
1942
+ "Model": "3 Opus",
1943
+ "Type": "Large Language Model",
1944
+ "Dataset": "Conf. of 3P Belief (J)",
1945
+ "Score": 87.2,
1946
+ "Condition": "False"
1947
+ },
1948
+ {
1949
+ "Model": "3 Sonnet",
1950
+ "Type": "Large Language Model",
1951
+ "Dataset": "Conf. of 3P Belief (J)",
1952
+ "Score": 86.0,
1953
+ "Condition": "False"
1954
+ },
1955
+ {
1956
+ "Model": "3 Haiku",
1957
+ "Type": "Large Language Model",
1958
+ "Dataset": "Conf. of 3P Belief (J)",
1959
+ "Score": 76.2,
1960
+ "Condition": "False"
1961
+ },
1962
+ {
1963
+ "Model": "Mixtral 8x22B",
1964
+ "Type": "Large Language Model",
1965
+ "Dataset": "Conf. of 3P Belief (J)",
1966
+ "Score": 83.6,
1967
+ "Condition": "False"
1968
+ },
1969
+ {
1970
+ "Model": "Mixtral 8x7B",
1971
+ "Type": "Large Language Model",
1972
+ "Dataset": "Conf. of 3P Belief (J)",
1973
+ "Score": 55.0,
1974
+ "Condition": "False"
1975
+ },
1976
+ {
1977
+ "Model": "Mixtral 7B",
1978
+ "Type": "Large Language Model",
1979
+ "Dataset": "Conf. of 3P Belief (J)",
1980
+ "Score": 84.6,
1981
+ "Condition": "False"
1982
+ },
1983
+ {
1984
+ "Model": "Llama-3 70B",
1985
+ "Type": "Large Language Model",
1986
+ "Dataset": "Conf. of 3P Belief (J)",
1987
+ "Score": 88.6,
1988
+ "Condition": "False"
1989
+ },
1990
+ {
1991
+ "Model": "Llama-3 8B",
1992
+ "Type": "Large Language Model",
1993
+ "Dataset": "Conf. of 3P Belief (J)",
1994
+ "Score": 79.6,
1995
+ "Condition": "False"
1996
+ },
1997
+ {
1998
+ "Model": "Llama-2 70B",
1999
+ "Type": "Large Language Model",
2000
+ "Dataset": "Conf. of 3P Belief (J)",
2001
+ "Score": 87.6,
2002
+ "Condition": "False"
2003
+ },
2004
+ {
2005
+ "Model": "Llama-2 13B",
2006
+ "Type": "Large Language Model",
2007
+ "Dataset": "Conf. of 3P Belief (J)",
2008
+ "Score": 79.6,
2009
+ "Condition": "False"
2010
+ },
2011
+ {
2012
+ "Model": "Llama-2 7B",
2013
+ "Type": "Large Language Model",
2014
+ "Dataset": "Conf. of 3P Belief (J)",
2015
+ "Score": 79.8,
2016
+ "Condition": "False"
2017
+ },
2018
+ {
2019
+ "Model": "o1",
2020
+ "Type": "Large Language Model",
2021
+ "Dataset": "Conf. of 3P Belief (M)",
2022
+ "Score": 100.0,
2023
+ "Condition": "True"
2024
+ },
2025
+ {
2026
+ "Model": "o3-mini",
2027
+ "Type": "Large Language Model",
2028
+ "Dataset": "Conf. of 3P Belief (M)",
2029
+ "Score": 100.0,
2030
+ "Condition": "True"
2031
+ },
2032
+ {
2033
+ "Model": "4o",
2034
+ "Type": "Large Language Model",
2035
+ "Dataset": "Conf. of 3P Belief (M)",
2036
+ "Score": 98.8,
2037
+ "Condition": "True"
2038
+ },
2039
+ {
2040
+ "Model": "3.7 Sonnet",
2041
+ "Type": "Large Language Model",
2042
+ "Dataset": "Conf. of 3P Belief (M)",
2043
+ "Score": 99.8,
2044
+ "Condition": "True"
2045
+ },
2046
+ {
2047
+ "Model": "3.5 Sonnet",
2048
+ "Type": "Large Language Model",
2049
+ "Dataset": "Conf. of 3P Belief (M)",
2050
+ "Score": 100.0,
2051
+ "Condition": "True"
2052
+ },
2053
+ {
2054
+ "Model": "Gemini 2 Flash",
2055
+ "Type": "Large Language Model",
2056
+ "Dataset": "Conf. of 3P Belief (M)",
2057
+ "Score": 100.0,
2058
+ "Condition": "True"
2059
+ },
2060
+ {
2061
+ "Model": "Gemini 2 Flash-Lite",
2062
+ "Type": "Large Language Model",
2063
+ "Dataset": "Conf. of 3P Belief (M)",
2064
+ "Score": 99.6,
2065
+ "Condition": "True"
2066
+ },
2067
+ {
2068
+ "Model": "R1",
2069
+ "Type": "Large Language Model",
2070
+ "Dataset": "Conf. of 3P Belief (M)",
2071
+ "Score": 98.2,
2072
+ "Condition": "True"
2073
+ },
2074
+ {
2075
+ "Model": "R1-Dist.-Llama-70B",
2076
+ "Type": "Large Language Model",
2077
+ "Dataset": "Conf. of 3P Belief (M)",
2078
+ "Score": 99.8,
2079
+ "Condition": "True"
2080
+ },
2081
+ {
2082
+ "Model": "R1-Dist.Qwen-14B",
2083
+ "Type": "Large Language Model",
2084
+ "Dataset": "Conf. of 3P Belief (M)",
2085
+ "Score": 99.2,
2086
+ "Condition": "True"
2087
+ },
2088
+ {
2089
+ "Model": "Llama 3.3-70B Inst. Turbo",
2090
+ "Type": "Large Language Model",
2091
+ "Dataset": "Conf. of 3P Belief (M)",
2092
+ "Score": 100.0,
2093
+ "Condition": "True"
2094
+ },
2095
+ {
2096
+ "Model": "GPT-4",
2097
+ "Type": "Large Language Model",
2098
+ "Dataset": "Conf. of 3P Belief (M)",
2099
+ "Score": 98.4,
2100
+ "Condition": "True"
2101
+ },
2102
+ {
2103
+ "Model": "GPT-3.5",
2104
+ "Type": "Large Language Model",
2105
+ "Dataset": "Conf. of 3P Belief (M)",
2106
+ "Score": 95.0,
2107
+ "Condition": "True"
2108
+ },
2109
+ {
2110
+ "Model": "3 Opus",
2111
+ "Type": "Large Language Model",
2112
+ "Dataset": "Conf. of 3P Belief (M)",
2113
+ "Score": 96.6,
2114
+ "Condition": "True"
2115
+ },
2116
+ {
2117
+ "Model": "3 Sonnet",
2118
+ "Type": "Large Language Model",
2119
+ "Dataset": "Conf. of 3P Belief (M)",
2120
+ "Score": 97.4,
2121
+ "Condition": "True"
2122
+ },
2123
+ {
2124
+ "Model": "3 Haiku",
2125
+ "Type": "Large Language Model",
2126
+ "Dataset": "Conf. of 3P Belief (M)",
2127
+ "Score": 97.0,
2128
+ "Condition": "True"
2129
+ },
2130
+ {
2131
+ "Model": "Mixtral 8x22B",
2132
+ "Type": "Large Language Model",
2133
+ "Dataset": "Conf. of 3P Belief (M)",
2134
+ "Score": 97.8,
2135
+ "Condition": "True"
2136
+ },
2137
+ {
2138
+ "Model": "Mixtral 8x7B",
2139
+ "Type": "Large Language Model",
2140
+ "Dataset": "Conf. of 3P Belief (M)",
2141
+ "Score": 87.8,
2142
+ "Condition": "True"
2143
+ },
2144
+ {
2145
+ "Model": "Mixtral 7B",
2146
+ "Type": "Large Language Model",
2147
+ "Dataset": "Conf. of 3P Belief (M)",
2148
+ "Score": 87.4,
2149
+ "Condition": "True"
2150
+ },
2151
+ {
2152
+ "Model": "Llama-3 70B",
2153
+ "Type": "Large Language Model",
2154
+ "Dataset": "Conf. of 3P Belief (M)",
2155
+ "Score": 96.6,
2156
+ "Condition": "True"
2157
+ },
2158
+ {
2159
+ "Model": "Llama-3 8B",
2160
+ "Type": "Large Language Model",
2161
+ "Dataset": "Conf. of 3P Belief (M)",
2162
+ "Score": 93.4,
2163
+ "Condition": "True"
2164
+ },
2165
+ {
2166
+ "Model": "Llama-2 70B",
2167
+ "Type": "Large Language Model",
2168
+ "Dataset": "Conf. of 3P Belief (M)",
2169
+ "Score": 96.0,
2170
+ "Condition": "True"
2171
+ },
2172
+ {
2173
+ "Model": "Llama-2 13B",
2174
+ "Type": "Large Language Model",
2175
+ "Dataset": "Conf. of 3P Belief (M)",
2176
+ "Score": 93.6,
2177
+ "Condition": "True"
2178
+ },
2179
+ {
2180
+ "Model": "Llama-2 7B",
2181
+ "Type": "Large Language Model",
2182
+ "Dataset": "Conf. of 3P Belief (M)",
2183
+ "Score": 93.6,
2184
+ "Condition": "True"
2185
+ },
2186
+ {
2187
+ "Model": "o1",
2188
+ "Type": "Large Language Model",
2189
+ "Dataset": "Conf. of 3P Belief (M)",
2190
+ "Score": 98.2,
2191
+ "Condition": "False"
2192
+ },
2193
+ {
2194
+ "Model": "o3-mini",
2195
+ "Type": "Large Language Model",
2196
+ "Dataset": "Conf. of 3P Belief (M)",
2197
+ "Score": 99.4,
2198
+ "Condition": "False"
2199
+ },
2200
+ {
2201
+ "Model": "4o",
2202
+ "Type": "Large Language Model",
2203
+ "Dataset": "Conf. of 3P Belief (M)",
2204
+ "Score": 87.0,
2205
+ "Condition": "False"
2206
+ },
2207
+ {
2208
+ "Model": "3.7 Sonnet",
2209
+ "Type": "Large Language Model",
2210
+ "Dataset": "Conf. of 3P Belief (M)",
2211
+ "Score": 99.2,
2212
+ "Condition": "False"
2213
+ },
2214
+ {
2215
+ "Model": "3.5 Sonnet",
2216
+ "Type": "Large Language Model",
2217
+ "Dataset": "Conf. of 3P Belief (M)",
2218
+ "Score": 97.8,
2219
+ "Condition": "False"
2220
+ },
2221
+ {
2222
+ "Model": "Gemini 2 Flash",
2223
+ "Type": "Large Language Model",
2224
+ "Dataset": "Conf. of 3P Belief (M)",
2225
+ "Score": 99.6,
2226
+ "Condition": "False"
2227
+ },
2228
+ {
2229
+ "Model": "Gemini 2 Flash-Lite",
2230
+ "Type": "Large Language Model",
2231
+ "Dataset": "Conf. of 3P Belief (M)",
2232
+ "Score": 94.2,
2233
+ "Condition": "False"
2234
+ },
2235
+ {
2236
+ "Model": "R1",
2237
+ "Type": "Large Language Model",
2238
+ "Dataset": "Conf. of 3P Belief (M)",
2239
+ "Score": 90.6,
2240
+ "Condition": "False"
2241
+ },
2242
+ {
2243
+ "Model": "R1-Dist.-Llama-70B",
2244
+ "Type": "Large Language Model",
2245
+ "Dataset": "Conf. of 3P Belief (M)",
2246
+ "Score": 98.0,
2247
+ "Condition": "False"
2248
+ },
2249
+ {
2250
+ "Model": "R1-Dist.Qwen-14B",
2251
+ "Type": "Large Language Model",
2252
+ "Dataset": "Conf. of 3P Belief (M)",
2253
+ "Score": 86.6,
2254
+ "Condition": "False"
2255
+ },
2256
+ {
2257
+ "Model": "Llama 3.3-70B Inst. Turbo",
2258
+ "Type": "Large Language Model",
2259
+ "Dataset": "Conf. of 3P Belief (M)",
2260
+ "Score": 99.8,
2261
+ "Condition": "False"
2262
+ },
2263
+ {
2264
+ "Model": "GPT-4",
2265
+ "Type": "Large Language Model",
2266
+ "Dataset": "Conf. of 3P Belief (M)",
2267
+ "Score": 77.6,
2268
+ "Condition": "False"
2269
+ },
2270
+ {
2271
+ "Model": "GPT-3.5",
2272
+ "Type": "Large Language Model",
2273
+ "Dataset": "Conf. of 3P Belief (M)",
2274
+ "Score": 63.6,
2275
+ "Condition": "False"
2276
+ },
2277
+ {
2278
+ "Model": "3 Opus",
2279
+ "Type": "Large Language Model",
2280
+ "Dataset": "Conf. of 3P Belief (M)",
2281
+ "Score": 89.4,
2282
+ "Condition": "False"
2283
+ },
2284
+ {
2285
+ "Model": "3 Sonnet",
2286
+ "Type": "Large Language Model",
2287
+ "Dataset": "Conf. of 3P Belief (M)",
2288
+ "Score": 88.0,
2289
+ "Condition": "False"
2290
+ },
2291
+ {
2292
+ "Model": "3 Haiku",
2293
+ "Type": "Large Language Model",
2294
+ "Dataset": "Conf. of 3P Belief (M)",
2295
+ "Score": 75.4,
2296
+ "Condition": "False"
2297
+ },
2298
+ {
2299
+ "Model": "Mixtral 8x22B",
2300
+ "Type": "Large Language Model",
2301
+ "Dataset": "Conf. of 3P Belief (M)",
2302
+ "Score": 86.2,
2303
+ "Condition": "False"
2304
+ },
2305
+ {
2306
+ "Model": "Mixtral 8x7B",
2307
+ "Type": "Large Language Model",
2308
+ "Dataset": "Conf. of 3P Belief (M)",
2309
+ "Score": 55.8,
2310
+ "Condition": "False"
2311
+ },
2312
+ {
2313
+ "Model": "Mixtral 7B",
2314
+ "Type": "Large Language Model",
2315
+ "Dataset": "Conf. of 3P Belief (M)",
2316
+ "Score": 76.6,
2317
+ "Condition": "False"
2318
+ },
2319
+ {
2320
+ "Model": "Llama-3 70B",
2321
+ "Type": "Large Language Model",
2322
+ "Dataset": "Conf. of 3P Belief (M)",
2323
+ "Score": 90.2,
2324
+ "Condition": "False"
2325
+ },
2326
+ {
2327
+ "Model": "Llama-3 8B",
2328
+ "Type": "Large Language Model",
2329
+ "Dataset": "Conf. of 3P Belief (M)",
2330
+ "Score": 79.0,
2331
+ "Condition": "False"
2332
+ },
2333
+ {
2334
+ "Model": "Llama-2 70B",
2335
+ "Type": "Large Language Model",
2336
+ "Dataset": "Conf. of 3P Belief (M)",
2337
+ "Score": 89.4,
2338
+ "Condition": "False"
2339
+ },
2340
+ {
2341
+ "Model": "Llama-2 13B",
2342
+ "Type": "Large Language Model",
2343
+ "Dataset": "Conf. of 3P Belief (M)",
2344
+ "Score": 79.0,
2345
+ "Condition": "False"
2346
+ },
2347
+ {
2348
+ "Model": "Llama-2 7B",
2349
+ "Type": "Large Language Model",
2350
+ "Dataset": "Conf. of 3P Belief (M)",
2351
+ "Score": 79.0,
2352
+ "Condition": "False"
2353
+ },
2354
+ {
2355
+ "Model": "o1",
2356
+ "Type": "Large Language Model",
2357
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2358
+ "Score": 99.8,
2359
+ "Condition": "True"
2360
+ },
2361
+ {
2362
+ "Model": "o3-mini",
2363
+ "Type": "Large Language Model",
2364
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2365
+ "Score": 100.0,
2366
+ "Condition": "True"
2367
+ },
2368
+ {
2369
+ "Model": "4o",
2370
+ "Type": "Large Language Model",
2371
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2372
+ "Score": 99.2,
2373
+ "Condition": "True"
2374
+ },
2375
+ {
2376
+ "Model": "3.7 Sonnet",
2377
+ "Type": "Large Language Model",
2378
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2379
+ "Score": 100.0,
2380
+ "Condition": "True"
2381
+ },
2382
+ {
2383
+ "Model": "3.5 Sonnet",
2384
+ "Type": "Large Language Model",
2385
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2386
+ "Score": 100.0,
2387
+ "Condition": "True"
2388
+ },
2389
+ {
2390
+ "Model": "Gemini 2 Flash",
2391
+ "Type": "Large Language Model",
2392
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2393
+ "Score": 100.0,
2394
+ "Condition": "True"
2395
+ },
2396
+ {
2397
+ "Model": "Gemini 2 Flash-Lite",
2398
+ "Type": "Large Language Model",
2399
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2400
+ "Score": 100.0,
2401
+ "Condition": "True"
2402
+ },
2403
+ {
2404
+ "Model": "R1",
2405
+ "Type": "Large Language Model",
2406
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2407
+ "Score": 100.0,
2408
+ "Condition": "True"
2409
+ },
2410
+ {
2411
+ "Model": "R1-Dist.-Llama-70B",
2412
+ "Type": "Large Language Model",
2413
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2414
+ "Score": 100.0,
2415
+ "Condition": "True"
2416
+ },
2417
+ {
2418
+ "Model": "R1-Dist.Qwen-14B",
2419
+ "Type": "Large Language Model",
2420
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2421
+ "Score": 100.0,
2422
+ "Condition": "True"
2423
+ },
2424
+ {
2425
+ "Model": "Llama 3.3-70B Inst. Turbo",
2426
+ "Type": "Large Language Model",
2427
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2428
+ "Score": 100.0,
2429
+ "Condition": "True"
2430
+ },
2431
+ {
2432
+ "Model": "GPT-4",
2433
+ "Type": "Large Language Model",
2434
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2435
+ "Score": 99.0,
2436
+ "Condition": "True"
2437
+ },
2438
+ {
2439
+ "Model": "GPT-3.5",
2440
+ "Type": "Large Language Model",
2441
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2442
+ "Score": 95.2,
2443
+ "Condition": "True"
2444
+ },
2445
+ {
2446
+ "Model": "3 Opus",
2447
+ "Type": "Large Language Model",
2448
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2449
+ "Score": 96.6,
2450
+ "Condition": "True"
2451
+ },
2452
+ {
2453
+ "Model": "3 Sonnet",
2454
+ "Type": "Large Language Model",
2455
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2456
+ "Score": 97.8,
2457
+ "Condition": "True"
2458
+ },
2459
+ {
2460
+ "Model": "3 Haiku",
2461
+ "Type": "Large Language Model",
2462
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2463
+ "Score": 98.8,
2464
+ "Condition": "True"
2465
+ },
2466
+ {
2467
+ "Model": "Mixtral 8x22B",
2468
+ "Type": "Large Language Model",
2469
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2470
+ "Score": 98.4,
2471
+ "Condition": "True"
2472
+ },
2473
+ {
2474
+ "Model": "Mixtral 8x7B",
2475
+ "Type": "Large Language Model",
2476
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2477
+ "Score": 97.0,
2478
+ "Condition": "True"
2479
+ },
2480
+ {
2481
+ "Model": "Mixtral 7B",
2482
+ "Type": "Large Language Model",
2483
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2484
+ "Score": 90.8,
2485
+ "Condition": "True"
2486
+ },
2487
+ {
2488
+ "Model": "Llama-3 70B",
2489
+ "Type": "Large Language Model",
2490
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2491
+ "Score": 96.6,
2492
+ "Condition": "True"
2493
+ },
2494
+ {
2495
+ "Model": "Llama-3 8B",
2496
+ "Type": "Large Language Model",
2497
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2498
+ "Score": 96.2,
2499
+ "Condition": "True"
2500
+ },
2501
+ {
2502
+ "Model": "Llama-2 70B",
2503
+ "Type": "Large Language Model",
2504
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2505
+ "Score": 96.0,
2506
+ "Condition": "True"
2507
+ },
2508
+ {
2509
+ "Model": "Llama-2 13B",
2510
+ "Type": "Large Language Model",
2511
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2512
+ "Score": 96.0,
2513
+ "Condition": "True"
2514
+ },
2515
+ {
2516
+ "Model": "Llama-2 7B",
2517
+ "Type": "Large Language Model",
2518
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2519
+ "Score": 96.0,
2520
+ "Condition": "True"
2521
+ },
2522
+ {
2523
+ "Model": "o1",
2524
+ "Type": "Large Language Model",
2525
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2526
+ "Score": 100.0,
2527
+ "Condition": "False"
2528
+ },
2529
+ {
2530
+ "Model": "o3-mini",
2531
+ "Type": "Large Language Model",
2532
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2533
+ "Score": 100.0,
2534
+ "Condition": "False"
2535
+ },
2536
+ {
2537
+ "Model": "4o",
2538
+ "Type": "Large Language Model",
2539
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2540
+ "Score": 92.6,
2541
+ "Condition": "False"
2542
+ },
2543
+ {
2544
+ "Model": "3.7 Sonnet",
2545
+ "Type": "Large Language Model",
2546
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2547
+ "Score": 100.0,
2548
+ "Condition": "False"
2549
+ },
2550
+ {
2551
+ "Model": "3.5 Sonnet",
2552
+ "Type": "Large Language Model",
2553
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2554
+ "Score": 100.0,
2555
+ "Condition": "False"
2556
+ },
2557
+ {
2558
+ "Model": "Gemini 2 Flash",
2559
+ "Type": "Large Language Model",
2560
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2561
+ "Score": 100.0,
2562
+ "Condition": "False"
2563
+ },
2564
+ {
2565
+ "Model": "Gemini 2 Flash-Lite",
2566
+ "Type": "Large Language Model",
2567
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2568
+ "Score": 100.0,
2569
+ "Condition": "False"
2570
+ },
2571
+ {
2572
+ "Model": "R1",
2573
+ "Type": "Large Language Model",
2574
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2575
+ "Score": 100.0,
2576
+ "Condition": "False"
2577
+ },
2578
+ {
2579
+ "Model": "R1-Dist.-Llama-70B",
2580
+ "Type": "Large Language Model",
2581
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2582
+ "Score": 100.0,
2583
+ "Condition": "False"
2584
+ },
2585
+ {
2586
+ "Model": "R1-Dist.Qwen-14B",
2587
+ "Type": "Large Language Model",
2588
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2589
+ "Score": 99.8,
2590
+ "Condition": "False"
2591
+ },
2592
+ {
2593
+ "Model": "Llama 3.3-70B Inst. Turbo",
2594
+ "Type": "Large Language Model",
2595
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2596
+ "Score": 100.0,
2597
+ "Condition": "False"
2598
+ },
2599
+ {
2600
+ "Model": "GPT-4",
2601
+ "Type": "Large Language Model",
2602
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2603
+ "Score": 94.6,
2604
+ "Condition": "False"
2605
+ },
2606
+ {
2607
+ "Model": "GPT-3.5",
2608
+ "Type": "Large Language Model",
2609
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2610
+ "Score": 79.2,
2611
+ "Condition": "False"
2612
+ },
2613
+ {
2614
+ "Model": "3 Opus",
2615
+ "Type": "Large Language Model",
2616
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2617
+ "Score": 91.4,
2618
+ "Condition": "False"
2619
+ },
2620
+ {
2621
+ "Model": "3 Sonnet",
2622
+ "Type": "Large Language Model",
2623
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2624
+ "Score": 92.8,
2625
+ "Condition": "False"
2626
+ },
2627
+ {
2628
+ "Model": "3 Haiku",
2629
+ "Type": "Large Language Model",
2630
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2631
+ "Score": 93.0,
2632
+ "Condition": "False"
2633
+ },
2634
+ {
2635
+ "Model": "Mixtral 8x22B",
2636
+ "Type": "Large Language Model",
2637
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2638
+ "Score": 95.6,
2639
+ "Condition": "False"
2640
+ },
2641
+ {
2642
+ "Model": "Mixtral 8x7B",
2643
+ "Type": "Large Language Model",
2644
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2645
+ "Score": 91.8,
2646
+ "Condition": "False"
2647
+ },
2648
+ {
2649
+ "Model": "Mixtral 7B",
2650
+ "Type": "Large Language Model",
2651
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2652
+ "Score": 84.8,
2653
+ "Condition": "False"
2654
+ },
2655
+ {
2656
+ "Model": "Llama-3 70B",
2657
+ "Type": "Large Language Model",
2658
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2659
+ "Score": 93.6,
2660
+ "Condition": "False"
2661
+ },
2662
+ {
2663
+ "Model": "Llama-3 8B",
2664
+ "Type": "Large Language Model",
2665
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2666
+ "Score": 93.6,
2667
+ "Condition": "False"
2668
+ },
2669
+ {
2670
+ "Model": "Llama-2 70B",
2671
+ "Type": "Large Language Model",
2672
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2673
+ "Score": 92.8,
2674
+ "Condition": "False"
2675
+ },
2676
+ {
2677
+ "Model": "Llama-2 13B",
2678
+ "Type": "Large Language Model",
2679
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2680
+ "Score": 92.8,
2681
+ "Condition": "False"
2682
+ },
2683
+ {
2684
+ "Model": "Llama-2 7B",
2685
+ "Type": "Large Language Model",
2686
+ "Dataset": "Corr. Attrib. of Belief (JM)",
2687
+ "Score": 93.0,
2688
+ "Condition": "False"
2689
+ },
2690
+ {
2691
+ "Model": "o1",
2692
+ "Type": "Large Language Model",
2693
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2694
+ "Score": 100.0,
2695
+ "Condition": "True"
2696
+ },
2697
+ {
2698
+ "Model": "o3-mini",
2699
+ "Type": "Large Language Model",
2700
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2701
+ "Score": 100.0,
2702
+ "Condition": "True"
2703
+ },
2704
+ {
2705
+ "Model": "4o",
2706
+ "Type": "Large Language Model",
2707
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2708
+ "Score": 99.4,
2709
+ "Condition": "True"
2710
+ },
2711
+ {
2712
+ "Model": "3.7 Sonnet",
2713
+ "Type": "Large Language Model",
2714
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2715
+ "Score": 100.0,
2716
+ "Condition": "True"
2717
+ },
2718
+ {
2719
+ "Model": "3.5 Sonnet",
2720
+ "Type": "Large Language Model",
2721
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2722
+ "Score": 100.0,
2723
+ "Condition": "True"
2724
+ },
2725
+ {
2726
+ "Model": "Gemini 2 Flash",
2727
+ "Type": "Large Language Model",
2728
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2729
+ "Score": 100.0,
2730
+ "Condition": "True"
2731
+ },
2732
+ {
2733
+ "Model": "Gemini 2 Flash-Lite",
2734
+ "Type": "Large Language Model",
2735
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2736
+ "Score": 100.0,
2737
+ "Condition": "True"
2738
+ },
2739
+ {
2740
+ "Model": "R1",
2741
+ "Type": "Large Language Model",
2742
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2743
+ "Score": 100.0,
2744
+ "Condition": "True"
2745
+ },
2746
+ {
2747
+ "Model": "R1-Dist.-Llama-70B",
2748
+ "Type": "Large Language Model",
2749
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2750
+ "Score": 100.0,
2751
+ "Condition": "True"
2752
+ },
2753
+ {
2754
+ "Model": "R1-Dist.Qwen-14B",
2755
+ "Type": "Large Language Model",
2756
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2757
+ "Score": 99.8,
2758
+ "Condition": "True"
2759
+ },
2760
+ {
2761
+ "Model": "Llama 3.3-70B Inst. Turbo",
2762
+ "Type": "Large Language Model",
2763
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2764
+ "Score": 100.0,
2765
+ "Condition": "True"
2766
+ },
2767
+ {
2768
+ "Model": "GPT-4",
2769
+ "Type": "Large Language Model",
2770
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2771
+ "Score": 98.6,
2772
+ "Condition": "True"
2773
+ },
2774
+ {
2775
+ "Model": "GPT-3.5",
2776
+ "Type": "Large Language Model",
2777
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2778
+ "Score": 96.6,
2779
+ "Condition": "True"
2780
+ },
2781
+ {
2782
+ "Model": "3 Opus",
2783
+ "Type": "Large Language Model",
2784
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2785
+ "Score": 97.0,
2786
+ "Condition": "True"
2787
+ },
2788
+ {
2789
+ "Model": "3 Sonnet",
2790
+ "Type": "Large Language Model",
2791
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2792
+ "Score": 97.8,
2793
+ "Condition": "True"
2794
+ },
2795
+ {
2796
+ "Model": "3 Haiku",
2797
+ "Type": "Large Language Model",
2798
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2799
+ "Score": 98.0,
2800
+ "Condition": "True"
2801
+ },
2802
+ {
2803
+ "Model": "Mixtral 8x22B",
2804
+ "Type": "Large Language Model",
2805
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2806
+ "Score": 98.0,
2807
+ "Condition": "True"
2808
+ },
2809
+ {
2810
+ "Model": "Mixtral 8x7B",
2811
+ "Type": "Large Language Model",
2812
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2813
+ "Score": 95.6,
2814
+ "Condition": "True"
2815
+ },
2816
+ {
2817
+ "Model": "Mixtral 7B",
2818
+ "Type": "Large Language Model",
2819
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2820
+ "Score": 35.0,
2821
+ "Condition": "True"
2822
+ },
2823
+ {
2824
+ "Model": "Llama-3 70B",
2825
+ "Type": "Large Language Model",
2826
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2827
+ "Score": 96.6,
2828
+ "Condition": "True"
2829
+ },
2830
+ {
2831
+ "Model": "Llama-3 8B",
2832
+ "Type": "Large Language Model",
2833
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2834
+ "Score": 95.0,
2835
+ "Condition": "True"
2836
+ },
2837
+ {
2838
+ "Model": "Llama-2 70B",
2839
+ "Type": "Large Language Model",
2840
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2841
+ "Score": 96.0,
2842
+ "Condition": "True"
2843
+ },
2844
+ {
2845
+ "Model": "Llama-2 13B",
2846
+ "Type": "Large Language Model",
2847
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2848
+ "Score": 94.8,
2849
+ "Condition": "True"
2850
+ },
2851
+ {
2852
+ "Model": "Llama-2 7B",
2853
+ "Type": "Large Language Model",
2854
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2855
+ "Score": 95.2,
2856
+ "Condition": "True"
2857
+ },
2858
+ {
2859
+ "Model": "o1",
2860
+ "Type": "Large Language Model",
2861
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2862
+ "Score": 100.0,
2863
+ "Condition": "False"
2864
+ },
2865
+ {
2866
+ "Model": "o3-mini",
2867
+ "Type": "Large Language Model",
2868
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2869
+ "Score": 100.0,
2870
+ "Condition": "False"
2871
+ },
2872
+ {
2873
+ "Model": "4o",
2874
+ "Type": "Large Language Model",
2875
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2876
+ "Score": 93.4,
2877
+ "Condition": "False"
2878
+ },
2879
+ {
2880
+ "Model": "3.7 Sonnet",
2881
+ "Type": "Large Language Model",
2882
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2883
+ "Score": 100.0,
2884
+ "Condition": "False"
2885
+ },
2886
+ {
2887
+ "Model": "3.5 Sonnet",
2888
+ "Type": "Large Language Model",
2889
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2890
+ "Score": 100.0,
2891
+ "Condition": "False"
2892
+ },
2893
+ {
2894
+ "Model": "Gemini 2 Flash",
2895
+ "Type": "Large Language Model",
2896
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2897
+ "Score": 100.0,
2898
+ "Condition": "False"
2899
+ },
2900
+ {
2901
+ "Model": "Gemini 2 Flash-Lite",
2902
+ "Type": "Large Language Model",
2903
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2904
+ "Score": 100.0,
2905
+ "Condition": "False"
2906
+ },
2907
+ {
2908
+ "Model": "R1",
2909
+ "Type": "Large Language Model",
2910
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2911
+ "Score": 100.0,
2912
+ "Condition": "False"
2913
+ },
2914
+ {
2915
+ "Model": "R1-Dist.-Llama-70B",
2916
+ "Type": "Large Language Model",
2917
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2918
+ "Score": 99.8,
2919
+ "Condition": "False"
2920
+ },
2921
+ {
2922
+ "Model": "R1-Dist.Qwen-14B",
2923
+ "Type": "Large Language Model",
2924
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2925
+ "Score": 99.6,
2926
+ "Condition": "False"
2927
+ },
2928
+ {
2929
+ "Model": "Llama 3.3-70B Inst. Turbo",
2930
+ "Type": "Large Language Model",
2931
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2932
+ "Score": 100.0,
2933
+ "Condition": "False"
2934
+ },
2935
+ {
2936
+ "Model": "GPT-4",
2937
+ "Type": "Large Language Model",
2938
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2939
+ "Score": 94.0,
2940
+ "Condition": "False"
2941
+ },
2942
+ {
2943
+ "Model": "GPT-3.5",
2944
+ "Type": "Large Language Model",
2945
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2946
+ "Score": 84.8,
2947
+ "Condition": "False"
2948
+ },
2949
+ {
2950
+ "Model": "3 Opus",
2951
+ "Type": "Large Language Model",
2952
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2953
+ "Score": 91.4,
2954
+ "Condition": "False"
2955
+ },
2956
+ {
2957
+ "Model": "3 Sonnet",
2958
+ "Type": "Large Language Model",
2959
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2960
+ "Score": 93.0,
2961
+ "Condition": "False"
2962
+ },
2963
+ {
2964
+ "Model": "3 Haiku",
2965
+ "Type": "Large Language Model",
2966
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2967
+ "Score": 93.0,
2968
+ "Condition": "False"
2969
+ },
2970
+ {
2971
+ "Model": "Mixtral 8x22B",
2972
+ "Type": "Large Language Model",
2973
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2974
+ "Score": 95.0,
2975
+ "Condition": "False"
2976
+ },
2977
+ {
2978
+ "Model": "Mixtral 8x7B",
2979
+ "Type": "Large Language Model",
2980
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2981
+ "Score": 90.4,
2982
+ "Condition": "False"
2983
+ },
2984
+ {
2985
+ "Model": "Mixtral 7B",
2986
+ "Type": "Large Language Model",
2987
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2988
+ "Score": 26.2,
2989
+ "Condition": "False"
2990
+ },
2991
+ {
2992
+ "Model": "Llama-3 70B",
2993
+ "Type": "Large Language Model",
2994
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
2995
+ "Score": 93.6,
2996
+ "Condition": "False"
2997
+ },
2998
+ {
2999
+ "Model": "Llama-3 8B",
3000
+ "Type": "Large Language Model",
3001
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
3002
+ "Score": 88.8,
3003
+ "Condition": "False"
3004
+ },
3005
+ {
3006
+ "Model": "Llama-2 70B",
3007
+ "Type": "Large Language Model",
3008
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
3009
+ "Score": 92.8,
3010
+ "Condition": "False"
3011
+ },
3012
+ {
3013
+ "Model": "Llama-2 13B",
3014
+ "Type": "Large Language Model",
3015
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
3016
+ "Score": 88.6,
3017
+ "Condition": "False"
3018
+ },
3019
+ {
3020
+ "Model": "Llama-2 7B",
3021
+ "Type": "Large Language Model",
3022
+ "Dataset": "Corr. Attrib. of Belief (MJ)",
3023
+ "Score": 88.6,
3024
+ "Condition": "False"
3025
+ },
3026
+ {
3027
+ "Model": "o1",
3028
+ "Type": "Large Language Model",
3029
+ "Dataset": "Ver. of Rec. Knowledge",
3030
+ "Score": 96.0,
3031
+ "Condition": "True"
3032
+ },
3033
+ {
3034
+ "Model": "o3-mini",
3035
+ "Type": "Large Language Model",
3036
+ "Dataset": "Ver. of Rec. Knowledge",
3037
+ "Score": 93.6,
3038
+ "Condition": "True"
3039
+ },
3040
+ {
3041
+ "Model": "4o",
3042
+ "Type": "Large Language Model",
3043
+ "Dataset": "Ver. of Rec. Knowledge",
3044
+ "Score": 95.0,
3045
+ "Condition": "True"
3046
+ },
3047
+ {
3048
+ "Model": "3.7 Sonnet",
3049
+ "Type": "Large Language Model",
3050
+ "Dataset": "Ver. of Rec. Knowledge",
3051
+ "Score": 34.2,
3052
+ "Condition": "True"
3053
+ },
3054
+ {
3055
+ "Model": "3.5 Sonnet",
3056
+ "Type": "Large Language Model",
3057
+ "Dataset": "Ver. of Rec. Knowledge",
3058
+ "Score": 35.8,
3059
+ "Condition": "True"
3060
+ },
3061
+ {
3062
+ "Model": "Gemini 2 Flash",
3063
+ "Type": "Large Language Model",
3064
+ "Dataset": "Ver. of Rec. Knowledge",
3065
+ "Score": 97.2,
3066
+ "Condition": "True"
3067
+ },
3068
+ {
3069
+ "Model": "Gemini 2 Flash-Lite",
3070
+ "Type": "Large Language Model",
3071
+ "Dataset": "Ver. of Rec. Knowledge",
3072
+ "Score": 91.8,
3073
+ "Condition": "True"
3074
+ },
3075
+ {
3076
+ "Model": "R1",
3077
+ "Type": "Large Language Model",
3078
+ "Dataset": "Ver. of Rec. Knowledge",
3079
+ "Score": 90.6,
3080
+ "Condition": "True"
3081
+ },
3082
+ {
3083
+ "Model": "R1-Dist.-Llama-70B",
3084
+ "Type": "Large Language Model",
3085
+ "Dataset": "Ver. of Rec. Knowledge",
3086
+ "Score": 94.2,
3087
+ "Condition": "True"
3088
+ },
3089
+ {
3090
+ "Model": "R1-Dist.Qwen-14B",
3091
+ "Type": "Large Language Model",
3092
+ "Dataset": "Ver. of Rec. Knowledge",
3093
+ "Score": 81.6,
3094
+ "Condition": "True"
3095
+ },
3096
+ {
3097
+ "Model": "Llama 3.3-70B Inst. Turbo",
3098
+ "Type": "Large Language Model",
3099
+ "Dataset": "Ver. of Rec. Knowledge",
3100
+ "Score": 96.0,
3101
+ "Condition": "True"
3102
+ },
3103
+ {
3104
+ "Model": "GPT-4",
3105
+ "Type": "Large Language Model",
3106
+ "Dataset": "Ver. of Rec. Knowledge",
3107
+ "Score": 88.4,
3108
+ "Condition": "True"
3109
+ },
3110
+ {
3111
+ "Model": "GPT-3.5",
3112
+ "Type": "Large Language Model",
3113
+ "Dataset": "Ver. of Rec. Knowledge",
3114
+ "Score": 94.8,
3115
+ "Condition": "True"
3116
+ },
3117
+ {
3118
+ "Model": "3 Opus",
3119
+ "Type": "Large Language Model",
3120
+ "Dataset": "Ver. of Rec. Knowledge",
3121
+ "Score": 66.4,
3122
+ "Condition": "True"
3123
+ },
3124
+ {
3125
+ "Model": "3 Sonnet",
3126
+ "Type": "Large Language Model",
3127
+ "Dataset": "Ver. of Rec. Knowledge",
3128
+ "Score": 30.6,
3129
+ "Condition": "True"
3130
+ },
3131
+ {
3132
+ "Model": "3 Haiku",
3133
+ "Type": "Large Language Model",
3134
+ "Dataset": "Ver. of Rec. Knowledge",
3135
+ "Score": 87.0,
3136
+ "Condition": "True"
3137
+ },
3138
+ {
3139
+ "Model": "Mixtral 8x22B",
3140
+ "Type": "Large Language Model",
3141
+ "Dataset": "Ver. of Rec. Knowledge",
3142
+ "Score": 93.2,
3143
+ "Condition": "True"
3144
+ },
3145
+ {
3146
+ "Model": "Mixtral 8x7B",
3147
+ "Type": "Large Language Model",
3148
+ "Dataset": "Ver. of Rec. Knowledge",
3149
+ "Score": 90.8,
3150
+ "Condition": "True"
3151
+ },
3152
+ {
3153
+ "Model": "Mixtral 7B",
3154
+ "Type": "Large Language Model",
3155
+ "Dataset": "Ver. of Rec. Knowledge",
3156
+ "Score": 89.2,
3157
+ "Condition": "True"
3158
+ },
3159
+ {
3160
+ "Model": "Llama-3 70B",
3161
+ "Type": "Large Language Model",
3162
+ "Dataset": "Ver. of Rec. Knowledge",
3163
+ "Score": 81.8,
3164
+ "Condition": "True"
3165
+ },
3166
+ {
3167
+ "Model": "Llama-3 8B",
3168
+ "Type": "Large Language Model",
3169
+ "Dataset": "Ver. of Rec. Knowledge",
3170
+ "Score": 82.8,
3171
+ "Condition": "True"
3172
+ },
3173
+ {
3174
+ "Model": "Llama-2 70B",
3175
+ "Type": "Large Language Model",
3176
+ "Dataset": "Ver. of Rec. Knowledge",
3177
+ "Score": 79.4,
3178
+ "Condition": "True"
3179
+ },
3180
+ {
3181
+ "Model": "Llama-2 13B",
3182
+ "Type": "Large Language Model",
3183
+ "Dataset": "Ver. of Rec. Knowledge",
3184
+ "Score": 81.2,
3185
+ "Condition": "True"
3186
+ },
3187
+ {
3188
+ "Model": "Llama-2 7B",
3189
+ "Type": "Large Language Model",
3190
+ "Dataset": "Ver. of Rec. Knowledge",
3191
+ "Score": 80.2,
3192
+ "Condition": "True"
3193
+ },
3194
+ {
3195
+ "Model": "o1",
3196
+ "Type": "Large Language Model",
3197
+ "Dataset": "Conf. of Rec. Knowledge",
3198
+ "Score": 100.0,
3199
+ "Condition": "True"
3200
+ },
3201
+ {
3202
+ "Model": "o3-mini",
3203
+ "Type": "Large Language Model",
3204
+ "Dataset": "Conf. of Rec. Knowledge",
3205
+ "Score": 100.0,
3206
+ "Condition": "True"
3207
+ },
3208
+ {
3209
+ "Model": "4o",
3210
+ "Type": "Large Language Model",
3211
+ "Dataset": "Conf. of Rec. Knowledge",
3212
+ "Score": 99.4,
3213
+ "Condition": "True"
3214
+ },
3215
+ {
3216
+ "Model": "3.7 Sonnet",
3217
+ "Type": "Large Language Model",
3218
+ "Dataset": "Conf. of Rec. Knowledge",
3219
+ "Score": 100.0,
3220
+ "Condition": "True"
3221
+ },
3222
+ {
3223
+ "Model": "3.5 Sonnet",
3224
+ "Type": "Large Language Model",
3225
+ "Dataset": "Conf. of Rec. Knowledge",
3226
+ "Score": 99.4,
3227
+ "Condition": "True"
3228
+ },
3229
+ {
3230
+ "Model": "Gemini 2 Flash",
3231
+ "Type": "Large Language Model",
3232
+ "Dataset": "Conf. of Rec. Knowledge",
3233
+ "Score": 100.0,
3234
+ "Condition": "True"
3235
+ },
3236
+ {
3237
+ "Model": "Gemini 2 Flash-Lite",
3238
+ "Type": "Large Language Model",
3239
+ "Dataset": "Conf. of Rec. Knowledge",
3240
+ "Score": 100.0,
3241
+ "Condition": "True"
3242
+ },
3243
+ {
3244
+ "Model": "R1",
3245
+ "Type": "Large Language Model",
3246
+ "Dataset": "Conf. of Rec. Knowledge",
3247
+ "Score": 100.0,
3248
+ "Condition": "True"
3249
+ },
3250
+ {
3251
+ "Model": "R1-Dist.-Llama-70B",
3252
+ "Type": "Large Language Model",
3253
+ "Dataset": "Conf. of Rec. Knowledge",
3254
+ "Score": 100.0,
3255
+ "Condition": "True"
3256
+ },
3257
+ {
3258
+ "Model": "R1-Dist.Qwen-14B",
3259
+ "Type": "Large Language Model",
3260
+ "Dataset": "Conf. of Rec. Knowledge",
3261
+ "Score": 100.0,
3262
+ "Condition": "True"
3263
+ },
3264
+ {
3265
+ "Model": "Llama 3.3-70B Inst. Turbo",
3266
+ "Type": "Large Language Model",
3267
+ "Dataset": "Conf. of Rec. Knowledge",
3268
+ "Score": 100.0,
3269
+ "Condition": "True"
3270
+ },
3271
+ {
3272
+ "Model": "GPT-4",
3273
+ "Type": "Large Language Model",
3274
+ "Dataset": "Conf. of Rec. Knowledge",
3275
+ "Score": 98.6,
3276
+ "Condition": "True"
3277
+ },
3278
+ {
3279
+ "Model": "GPT-3.5",
3280
+ "Type": "Large Language Model",
3281
+ "Dataset": "Conf. of Rec. Knowledge",
3282
+ "Score": 90.6,
3283
+ "Condition": "True"
3284
+ },
3285
+ {
3286
+ "Model": "3 Opus",
3287
+ "Type": "Large Language Model",
3288
+ "Dataset": "Conf. of Rec. Knowledge",
3289
+ "Score": 96.6,
3290
+ "Condition": "True"
3291
+ },
3292
+ {
3293
+ "Model": "3 Sonnet",
3294
+ "Type": "Large Language Model",
3295
+ "Dataset": "Conf. of Rec. Knowledge",
3296
+ "Score": 78.8,
3297
+ "Condition": "True"
3298
+ },
3299
+ {
3300
+ "Model": "3 Haiku",
3301
+ "Type": "Large Language Model",
3302
+ "Dataset": "Conf. of Rec. Knowledge",
3303
+ "Score": 95.0,
3304
+ "Condition": "True"
3305
+ },
3306
+ {
3307
+ "Model": "Mixtral 8x22B",
3308
+ "Type": "Large Language Model",
3309
+ "Dataset": "Conf. of Rec. Knowledge",
3310
+ "Score": 97.6,
3311
+ "Condition": "True"
3312
+ },
3313
+ {
3314
+ "Model": "Mixtral 8x7B",
3315
+ "Type": "Large Language Model",
3316
+ "Dataset": "Conf. of Rec. Knowledge",
3317
+ "Score": 83.0,
3318
+ "Condition": "True"
3319
+ },
3320
+ {
3321
+ "Model": "Mixtral 7B",
3322
+ "Type": "Large Language Model",
3323
+ "Dataset": "Conf. of Rec. Knowledge",
3324
+ "Score": 62.2,
3325
+ "Condition": "True"
3326
+ },
3327
+ {
3328
+ "Model": "Llama-3 70B",
3329
+ "Type": "Large Language Model",
3330
+ "Dataset": "Conf. of Rec. Knowledge",
3331
+ "Score": 96.4,
3332
+ "Condition": "True"
3333
+ },
3334
+ {
3335
+ "Model": "Llama-3 8B",
3336
+ "Type": "Large Language Model",
3337
+ "Dataset": "Conf. of Rec. Knowledge",
3338
+ "Score": 69.6,
3339
+ "Condition": "True"
3340
+ },
3341
+ {
3342
+ "Model": "Llama-2 70B",
3343
+ "Type": "Large Language Model",
3344
+ "Dataset": "Conf. of Rec. Knowledge",
3345
+ "Score": 96.2,
3346
+ "Condition": "True"
3347
+ },
3348
+ {
3349
+ "Model": "Llama-2 13B",
3350
+ "Type": "Large Language Model",
3351
+ "Dataset": "Conf. of Rec. Knowledge",
3352
+ "Score": 68.6,
3353
+ "Condition": "True"
3354
+ },
3355
+ {
3356
+ "Model": "Llama-2 7B",
3357
+ "Type": "Large Language Model",
3358
+ "Dataset": "Conf. of Rec. Knowledge",
3359
+ "Score": 68.6,
3360
+ "Condition": "True"
3361
+ },
3362
+ {
3363
+ "Model": "o1",
3364
+ "Type": "Large Language Model",
3365
+ "Dataset": "Awrn. of Rec. Knowledge",
3366
+ "Score": 99.8,
3367
+ "Condition": "True"
3368
+ },
3369
+ {
3370
+ "Model": "o3-mini",
3371
+ "Type": "Large Language Model",
3372
+ "Dataset": "Awrn. of Rec. Knowledge",
3373
+ "Score": 98.4,
3374
+ "Condition": "True"
3375
+ },
3376
+ {
3377
+ "Model": "4o",
3378
+ "Type": "Large Language Model",
3379
+ "Dataset": "Awrn. of Rec. Knowledge",
3380
+ "Score": 99.6,
3381
+ "Condition": "True"
3382
+ },
3383
+ {
3384
+ "Model": "3.7 Sonnet",
3385
+ "Type": "Large Language Model",
3386
+ "Dataset": "Awrn. of Rec. Knowledge",
3387
+ "Score": 96.0,
3388
+ "Condition": "True"
3389
+ },
3390
+ {
3391
+ "Model": "3.5 Sonnet",
3392
+ "Type": "Large Language Model",
3393
+ "Dataset": "Awrn. of Rec. Knowledge",
3394
+ "Score": 100.0,
3395
+ "Condition": "True"
3396
+ },
3397
+ {
3398
+ "Model": "Gemini 2 Flash",
3399
+ "Type": "Large Language Model",
3400
+ "Dataset": "Awrn. of Rec. Knowledge",
3401
+ "Score": 100.0,
3402
+ "Condition": "True"
3403
+ },
3404
+ {
3405
+ "Model": "Gemini 2 Flash-Lite",
3406
+ "Type": "Large Language Model",
3407
+ "Dataset": "Awrn. of Rec. Knowledge",
3408
+ "Score": 100.0,
3409
+ "Condition": "True"
3410
+ },
3411
+ {
3412
+ "Model": "R1",
3413
+ "Type": "Large Language Model",
3414
+ "Dataset": "Awrn. of Rec. Knowledge",
3415
+ "Score": 83.2,
3416
+ "Condition": "True"
3417
+ },
3418
+ {
3419
+ "Model": "R1-Dist.-Llama-70B",
3420
+ "Type": "Large Language Model",
3421
+ "Dataset": "Awrn. of Rec. Knowledge",
3422
+ "Score": 93.6,
3423
+ "Condition": "True"
3424
+ },
3425
+ {
3426
+ "Model": "R1-Dist.Qwen-14B",
3427
+ "Type": "Large Language Model",
3428
+ "Dataset": "Awrn. of Rec. Knowledge",
3429
+ "Score": 88.2,
3430
+ "Condition": "True"
3431
+ },
3432
+ {
3433
+ "Model": "Llama 3.3-70B Inst. Turbo",
3434
+ "Type": "Large Language Model",
3435
+ "Dataset": "Awrn. of Rec. Knowledge",
3436
+ "Score": 97.0,
3437
+ "Condition": "True"
3438
+ },
3439
+ {
3440
+ "Model": "GPT-4",
3441
+ "Type": "Large Language Model",
3442
+ "Dataset": "Awrn. of Rec. Knowledge",
3443
+ "Score": 98.6,
3444
+ "Condition": "True"
3445
+ },
3446
+ {
3447
+ "Model": "GPT-3.5",
3448
+ "Type": "Large Language Model",
3449
+ "Dataset": "Awrn. of Rec. Knowledge",
3450
+ "Score": 65.6,
3451
+ "Condition": "True"
3452
+ },
3453
+ {
3454
+ "Model": "3 Opus",
3455
+ "Type": "Large Language Model",
3456
+ "Dataset": "Awrn. of Rec. Knowledge",
3457
+ "Score": 97.2,
3458
+ "Condition": "True"
3459
+ },
3460
+ {
3461
+ "Model": "3 Sonnet",
3462
+ "Type": "Large Language Model",
3463
+ "Dataset": "Awrn. of Rec. Knowledge",
3464
+ "Score": 98.4,
3465
+ "Condition": "True"
3466
+ },
3467
+ {
3468
+ "Model": "3 Haiku",
3469
+ "Type": "Large Language Model",
3470
+ "Dataset": "Awrn. of Rec. Knowledge",
3471
+ "Score": 74.6,
3472
+ "Condition": "True"
3473
+ },
3474
+ {
3475
+ "Model": "Mixtral 8x22B",
3476
+ "Type": "Large Language Model",
3477
+ "Dataset": "Awrn. of Rec. Knowledge",
3478
+ "Score": 94.4,
3479
+ "Condition": "True"
3480
+ },
3481
+ {
3482
+ "Model": "Mixtral 8x7B",
3483
+ "Type": "Large Language Model",
3484
+ "Dataset": "Awrn. of Rec. Knowledge",
3485
+ "Score": 88.2,
3486
+ "Condition": "True"
3487
+ },
3488
+ {
3489
+ "Model": "Mixtral 7B",
3490
+ "Type": "Large Language Model",
3491
+ "Dataset": "Awrn. of Rec. Knowledge",
3492
+ "Score": 96.0,
3493
+ "Condition": "True"
3494
+ },
3495
+ {
3496
+ "Model": "Llama-3 70B",
3497
+ "Type": "Large Language Model",
3498
+ "Dataset": "Awrn. of Rec. Knowledge",
3499
+ "Score": 96.8,
3500
+ "Condition": "True"
3501
+ },
3502
+ {
3503
+ "Model": "Llama-3 8B",
3504
+ "Type": "Large Language Model",
3505
+ "Dataset": "Awrn. of Rec. Knowledge",
3506
+ "Score": 80.4,
3507
+ "Condition": "True"
3508
+ },
3509
+ {
3510
+ "Model": "Llama-2 70B",
3511
+ "Type": "Large Language Model",
3512
+ "Dataset": "Awrn. of Rec. Knowledge",
3513
+ "Score": 96.8,
3514
+ "Condition": "True"
3515
+ },
3516
+ {
3517
+ "Model": "Llama-2 13B",
3518
+ "Type": "Large Language Model",
3519
+ "Dataset": "Awrn. of Rec. Knowledge",
3520
+ "Score": 79.0,
3521
+ "Condition": "True"
3522
+ },
3523
+ {
3524
+ "Model": "Llama-2 7B",
3525
+ "Type": "Large Language Model",
3526
+ "Dataset": "Awrn. of Rec. Knowledge",
3527
+ "Score": 80.0,
3528
+ "Condition": "True"
3529
+ }
3530
+ ]
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=5.0.0
2
+ pandas>=1.5.0
3
+ matplotlib>=3.5.0
4
+ Pillow>=9.0.0