diff --git a/constants.py b/constants.py
index c958b88f29966f892a92ccad613ba0c92adff63c..06f7ba8d215bb31d379ac6da98656e8571432bec 100644
--- a/constants.py
+++ b/constants.py
@@ -116,6 +116,7 @@ MODEL_NAME_MAP = {
     "InternVL2_5_78B": "InternVL2.5-78B",
     "InternVL2_5_2B": "InternVL2.5-2B",
     "InternVL2_5_8B": "InternVL2.5-8B",
+    "Grok-2-vision-1212": "Grok-2-vision-1212",
 }
 
 DIMENSION_NAME_MAP = {
@@ -203,14 +204,15 @@ MODEL_URLS = {
     "InternVL2_5_78B": "https://huggingface.co/OpenGVLab/InternVL2_5-78B",
     "InternVL2_5_2B": "https://huggingface.co/OpenGVLab/InternVL2_5-2B",
     "InternVL2_5_8B": "https://huggingface.co/OpenGVLab/InternVL2_5-8B",
+    "Grok-2-vision-1212": "https://x.ai/blog/grok-1212",
 }
 
 # Define the base MODEL_GROUPS structure
 BASE_MODEL_GROUPS = {
     "All": list(MODEL_NAME_MAP.keys()),
-    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B'],
+    "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B', 'Grok-2-vision-1212'],
     "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"],
-    "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'],
+    "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Grok-2-vision-1212'],
     "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'],
     "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B"],
     "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B", "InternVL2_5_8B"]
diff --git a/static/eval_results/Default/Aquila_VL_2B/summary_results.json b/static/eval_results/Default/Aquila_VL_2B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Aquila_VL_2B/summary_results.json
rename to static/eval_results/Default/Aquila_VL_2B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Aria/summary_results.json b/static/eval_results/Default/Aria/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Aria/summary_results.json
rename to static/eval_results/Default/Aria/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Claude_3.5/summary_results.json b/static/eval_results/Default/Claude_3.5/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Claude_3.5/summary_results.json
rename to static/eval_results/Default/Claude_3.5/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Claude_3.5_new/summary_results.json b/static/eval_results/Default/Claude_3.5_new/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Claude_3.5_new/summary_results.json
rename to static/eval_results/Default/Claude_3.5_new/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/GPT_4o/summary_results.json b/static/eval_results/Default/GPT_4o/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/GPT_4o/summary_results.json
rename to static/eval_results/Default/GPT_4o/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/GPT_4o_mini/summary_results.json b/static/eval_results/Default/GPT_4o_mini/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/GPT_4o_mini/summary_results.json
rename to static/eval_results/Default/GPT_4o_mini/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json b/static/eval_results/Default/Gemini_1.5_flash_002/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json
rename to static/eval_results/Default/Gemini_1.5_flash_002/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json b/static/eval_results/Default/Gemini_1.5_pro_002/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json
rename to static/eval_results/Default/Gemini_1.5_pro_002/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Idefics3/summary_results.json b/static/eval_results/Default/Idefics3/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Idefics3/summary_results.json
rename to static/eval_results/Default/Idefics3/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/InternVL2_2B/summary_results.json b/static/eval_results/Default/InternVL2_2B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/InternVL2_2B/summary_results.json
rename to static/eval_results/Default/InternVL2_2B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/InternVL2_5_2B/summary_results.json b/static/eval_results/Default/InternVL2_5_2B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/InternVL2_5_2B/summary_results.json
rename to static/eval_results/Default/InternVL2_5_2B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/InternVL2_5_78B/summary_results.json b/static/eval_results/Default/InternVL2_5_78B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/InternVL2_5_78B/summary_results.json
rename to static/eval_results/Default/InternVL2_5_78B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/InternVL2_5_8B/summary_results.json b/static/eval_results/Default/InternVL2_5_8B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/InternVL2_5_8B/summary_results.json
rename to static/eval_results/Default/InternVL2_5_8B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/InternVL2_76B/summary_results.json b/static/eval_results/Default/InternVL2_76B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/InternVL2_76B/summary_results.json
rename to static/eval_results/Default/InternVL2_76B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/InternVL2_8B/summary_results.json b/static/eval_results/Default/InternVL2_8B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/InternVL2_8B/summary_results.json
rename to static/eval_results/Default/InternVL2_8B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Llama_3_2_11B/summary_results.json b/static/eval_results/Default/Llama_3_2_11B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Llama_3_2_11B/summary_results.json
rename to static/eval_results/Default/Llama_3_2_11B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Mammoth_VL/summary_results.json b/static/eval_results/Default/Mammoth_VL/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Mammoth_VL/summary_results.json
rename to static/eval_results/Default/Mammoth_VL/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/MiniCPM_v2.6/summary_results.json b/static/eval_results/Default/MiniCPM_v2.6/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/MiniCPM_v2.6/summary_results.json
rename to static/eval_results/Default/MiniCPM_v2.6/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/NVLM/summary_results.json b/static/eval_results/Default/NVLM/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/NVLM/summary_results.json
rename to static/eval_results/Default/NVLM/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Phi-3.5-vision/summary_results.json b/static/eval_results/Default/Phi-3.5-vision/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Phi-3.5-vision/summary_results.json
rename to static/eval_results/Default/Phi-3.5-vision/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Pixtral_12B/summary_results.json b/static/eval_results/Default/Pixtral_12B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Pixtral_12B/summary_results.json
rename to static/eval_results/Default/Pixtral_12B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Qwen2_VL_2B/summary_results.json b/static/eval_results/Default/Qwen2_VL_2B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Qwen2_VL_2B/summary_results.json
rename to static/eval_results/Default/Qwen2_VL_2B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Qwen2_VL_72B/summary_results.json b/static/eval_results/Default/Qwen2_VL_72B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Qwen2_VL_72B/summary_results.json
rename to static/eval_results/Default/Qwen2_VL_72B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/Qwen2_VL_7B/summary_results.json b/static/eval_results/Default/Qwen2_VL_7B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/Qwen2_VL_7B/summary_results.json
rename to static/eval_results/Default/Qwen2_VL_7B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/llava_onevision_72B/summary_results.json b/static/eval_results/Default/llava_onevision_72B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/llava_onevision_72B/summary_results.json
rename to static/eval_results/Default/llava_onevision_72B/summary_and_keyword_stats.json
diff --git a/static/eval_results/Default/llava_onevision_7B/summary_results.json b/static/eval_results/Default/llava_onevision_7B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/Default/llava_onevision_7B/summary_results.json
rename to static/eval_results/Default/llava_onevision_7B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Aquila_VL_2B/summary_results.json b/static/eval_results/SI/Aquila_VL_2B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Aquila_VL_2B/summary_results.json
rename to static/eval_results/SI/Aquila_VL_2B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Aria/summary_results.json b/static/eval_results/SI/Aria/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Aria/summary_results.json
rename to static/eval_results/SI/Aria/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Claude_3.5/summary_results.json b/static/eval_results/SI/Claude_3.5/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Claude_3.5/summary_results.json
rename to static/eval_results/SI/Claude_3.5/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Claude_3.5_new/summary_results.json b/static/eval_results/SI/Claude_3.5_new/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Claude_3.5_new/summary_results.json
rename to static/eval_results/SI/Claude_3.5_new/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/GPT_4o/summary_results.json b/static/eval_results/SI/GPT_4o/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/GPT_4o/summary_results.json
rename to static/eval_results/SI/GPT_4o/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/GPT_4o_mini/summary_results.json b/static/eval_results/SI/GPT_4o_mini/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/GPT_4o_mini/summary_results.json
rename to static/eval_results/SI/GPT_4o_mini/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json b/static/eval_results/SI/Gemini_1.5_flash_002/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json
rename to static/eval_results/SI/Gemini_1.5_flash_002/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json b/static/eval_results/SI/Gemini_1.5_pro_002/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json
rename to static/eval_results/SI/Gemini_1.5_pro_002/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Grok-2-vision-1212/summary_and_keyword_stats.json b/static/eval_results/SI/Grok-2-vision-1212/summary_and_keyword_stats.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d450f00678d1d308acc96d5bee5c24dda6da1c6
--- /dev/null
+++ b/static/eval_results/SI/Grok-2-vision-1212/summary_and_keyword_stats.json
@@ -0,0 +1,213 @@
+{
+    "model_summary": {
+        "core": {
+            "num_eval_tasks": 273,
+            "num_eval_samples": 4108,
+            "macro_mean_score": 0.4120738315897316
+        },
+        "open": {
+            "num_eval_tasks": 42,
+            "num_eval_samples": 808,
+            "macro_mean_score": 0.5369427320775519
+        },
+        "overall_score": 0.42872301832144094
+    },
+    "keyword_stats": {
+        "skills": {
+            "Object Recognition and Classification": {
+                "count": 172,
+                "num_samples": 2704,
+                "tasks": [],
+                "average_score": 0.466222487838683
+            },
+            "Language Understanding and Generation": {
+                "count": 102,
+                "num_samples": 1707,
+                "tasks": [],
+                "average_score": 0.49260409084481493
+            },
+            "Commonsense and Social Reasoning": {
+                "count": 38,
+                "num_samples": 652,
+                "tasks": [],
+                "average_score": 0.5513856107049714
+            },
+            "Scene and Event Understanding": {
+                "count": 60,
+                "num_samples": 1004,
+                "tasks": [],
+                "average_score": 0.5869208042949662
+            },
+            "Domain-Specific Knowledge and Skills": {
+                "count": 46,
+                "num_samples": 896,
+                "tasks": [],
+                "average_score": 0.4815724520339999
+            },
+            "Ethical and Safety Reasoning": {
+                "count": 10,
+                "num_samples": 170,
+                "tasks": [],
+                "average_score": 0.6636804511278196
+            },
+            "Text Recognition (OCR)": {
+                "count": 101,
+                "num_samples": 1680,
+                "tasks": [],
+                "average_score": 0.3702735127125422
+            },
+            "Spatial and Temporal Reasoning": {
+                "count": 78,
+                "num_samples": 1270,
+                "tasks": [],
+                "average_score": 0.33515724252578744
+            },
+            "Mathematical and Logical Reasoning": {
+                "count": 91,
+                "num_samples": 1628,
+                "tasks": [],
+                "average_score": 0.38305225927176256
+            },
+            "Planning and Decision Making": {
+                "count": 23,
+                "num_samples": 355,
+                "tasks": [],
+                "average_score": 0.10535044936296332
+            }
+        },
+        "input_format": {
+            "Photographs": {
+                "count": 83,
+                "num_samples": 1310,
+                "tasks": [],
+                "average_score": 0.5618064274876843
+            },
+            "Artistic and Creative Content": {
+                "count": 22,
+                "num_samples": 388,
+                "tasks": [],
+                "average_score": 0.5826343022222362
+            },
+            "Diagrams and Data Visualizations": {
+                "count": 88,
+                "num_samples": 1523,
+                "tasks": [],
+                "average_score": 0.46202217635966664
+            },
+            "Text-Based Images and Documents": {
+                "count": 53,
+                "num_samples": 847,
+                "tasks": [],
+                "average_score": 0.30345879283667027
+            },
+            "User Interface Screenshots": {
+                "count": 67,
+                "num_samples": 1117,
+                "tasks": [],
+                "average_score": 0.2751054353728693
+            },
+            "3D Models and Aerial Imagery": {
+                "count": 2,
+                "num_samples": 30,
+                "tasks": [],
+                "average_score": 0.21326546545524588
+            }
+        },
+        "output_format": {
+            "contextual_formatted_text": {
+                "count": 63,
+                "num_samples": 972,
+                "tasks": [],
+                "average_score": 0.36955979847719544
+            },
+            "open_ended_output": {
+                "count": 51,
+                "num_samples": 986,
+                "tasks": [],
+                "average_score": 0.531045897627514
+            },
+            "structured_output": {
+                "count": 72,
+                "num_samples": 1120,
+                "tasks": [],
+                "average_score": 0.39618293480240524
+            },
+            "numerical_data": {
+                "count": 39,
+                "num_samples": 694,
+                "tasks": [],
+                "average_score": 0.4022896767902012
+            },
+            "multiple_choice": {
+                "count": 33,
+                "num_samples": 567,
+                "tasks": [],
+                "average_score": 0.5637194455376273
+            },
+            "exact_text": {
+                "count": 57,
+                "num_samples": 876,
+                "tasks": [],
+                "average_score": 0.3835953032430645
+            }
+        },
+        "input_num": {
+            "1-image": {
+                "count": 315,
+                "num_samples": 5215,
+                "tasks": [],
+                "average_score": 0.4287230183214409
+            }
+        },
+        "app": {
+            "Knowledge": {
+                "count": 77,
+                "num_samples": 1291,
+                "tasks": [],
+                "average_score": 0.5298084871907297
+            },
+            "Perception": {
+                "count": 82,
+                "num_samples": 1318,
+                "tasks": [],
+                "average_score": 0.5357263973810524
+            },
+            "Coding": {
+                "count": 16,
+                "num_samples": 244,
+                "tasks": [],
+                "average_score": 0.4783708274976657
+            },
+            "Science": {
+                "count": 22,
+                "num_samples": 469,
+                "tasks": [],
+                "average_score": 0.4448688427088975
+            },
+            "Information_Extraction": {
+                "count": 41,
+                "num_samples": 639,
+                "tasks": [],
+                "average_score": 0.312597090907984
+            },
+            "Planning": {
+                "count": 44,
+                "num_samples": 712,
+                "tasks": [],
+                "average_score": 0.18803058075452733
+            },
+            "Mathematics": {
+                "count": 30,
+                "num_samples": 497,
+                "tasks": [],
+                "average_score": 0.35624322358581967
+            },
+            "Metrics": {
+                "count": 3,
+                "num_samples": 45,
+                "tasks": [],
+                "average_score": 0.3682539682539683
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/static/eval_results/SI/Grok-2-vision-1212/task_results.json b/static/eval_results/SI/Grok-2-vision-1212/task_results.json
new file mode 100644
index 0000000000000000000000000000000000000000..16b575458bc62c11b09dda48176630f27ac9f1cb
--- /dev/null
+++ b/static/eval_results/SI/Grok-2-vision-1212/task_results.json
@@ -0,0 +1,2207 @@
+[
+    {
+        "name": "ascii_art_30",
+        "score": 0.14285714285714285,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14
+    },
+    {
+        "name": "humor_explanation",
+        "score": 0.8600000000000001,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 15
+    },
+    {
+        "name": "science_figure_explanation",
+        "score": 0.8448275862068965,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "vibe_eval_phrase",
+        "score": 0.6357142857142858,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14
+    },
+    {
+        "name": "traffic_accident_analysis",
+        "score": 0.39999999999999997,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14
+    },
+    {
+        "name": "figurative_speech_explanation",
+        "score": 0.8517241379310343,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "table2latex_complex",
+        "score": 0.611111111111111,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 9
+    },
+    {
+        "name": "unusual_images",
+        "score": 0.8275862068965517,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "art_explanation",
+        "score": 0.7689655172413796,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "ocr_open_ended_qa",
+        "score": 0.8172413793103449,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "bar_chart_interpretation",
+        "score": 0.5310344827586206,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "scibench_w_solution_open_ended",
+        "score": 0.39,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 25
+    },
+    {
+        "name": "GUI_Chat_Hard",
+        "score": 0.4538461538461539,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 26
+    },
+    {
+        "name": "image_humor_understanding",
+        "score": 0.886206896551724,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "defeasible_reasoning",
+        "score": 0.8655172413793104,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "funny_image_title",
+        "score": 0.6357142857142858,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14
+    },
+    {
+        "name": "tweets_captioning",
+        "score": 0.6214285714285716,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14
+    },
+    {
+        "name": "graph_interpretation",
+        "score": 0.7896551724137929,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "meme_explain",
+        "score": 0.892857142857143,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14
+    },
+    {
+        "name": "guess_image_generation_prompt",
+        "score": 0.7368421052631581,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 19
+    },
+    {
+        "name": "visualization_with_code",
+        "score": 0.5142857142857142,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14
+    },
+    {
+        "name": "iq_test_open_ended",
+        "score": 0.42758620689655175,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 29
+    },
+    {
+        "name": "electrocardiogram",
+        "score": 0.2785714285714285,
+        "eval_type": "llm",
+        "num_demo": 1,
+        "num_query": 14
+    },
"name": "image_captioning_with_additional_requirements", + "score": 0.8428571428571432, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "docci_image_description_long", + "score": 0.7142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Chat_Easy", + "score": 0.5923076923076924, + "eval_type": "llm", + "num_demo": 1, + "num_query": 26 + }, + { + "name": "bridge_strategies_advanced", + "score": 0.20714285714285716, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_worldclass", + "score": 0.1928571428571429, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bridge_strategies_expert", + "score": 0.3642857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Spanish", + "score": 0.12857142857142856, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Japanese", + "score": 0.09285714285714286, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_French", + "score": 0.09999999999999999, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Arabic", + "score": 0.13571428571428573, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_Russian", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multi_lingual_Ruozhiba_expalnation_English", + "score": 0.07142857142857142, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fetaqa", + "score": 0.4142857142857143, + "eval_type": "llm", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "red_teaming_celebrity", + "score": 0.8250000000000002, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_captcha", + "score": 0.12105263157894738, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_jailbreak", + "score": 0.575, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_visualmisleading", + "score": 0.8789473684210528, + "eval_type": "llm", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "red_teaming_racial", + "score": 0.7250000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "red_teaming_politics", + "score": 0.7150000000000001, + "eval_type": "llm", + "num_demo": 0, + "num_query": 20 + }, + { + "name": "brand_logo_recognition_and_elaboration", + "score": 0.78, + "eval_type": "rule", + "num_demo": 1, + "num_query": 25 + }, + { + "name": "exchange_rate_estimate_plot", + "score": 0.9693285714285712, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_parity", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "traffic_future_prediction_from_line_plot", + "score": 0.6339473684210527, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_chordless_cycle", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "youtube_video_info_parsing", + "score": 0.1547619047619048, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "super_clevr_scene_understanding", + "score": 0.5, + "eval_type": 
"rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "figureqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_keypoint_detection", + "score": 0.5273791704902433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "widerface_face_count_and_event_classification", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "average_humidity_estimate_plot", + "score": 0.764, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "weather_info_parsing", + "score": 0.5277777777777779, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_analysis_single_image", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "waybill_number_sequence_extraction", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_maxflow", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "TV_show_info_parsing", + "score": 0.3412698412698412, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "insect_order_classification", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_plot_future_prediction", + "score": 0.806521052631579, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "chemistry_exams_v", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "finance_table_understanding", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "funsd_document_qa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "vibe_eval_open", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "question_solution_solving", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_theory", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_analytic", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_length", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "algebra", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzle_single_step", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "chess_winner_identification", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "physical_property_reasoning", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "humor_understand_caption_match", + "score": 0.6666666666666666, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "coco_object_detection_by_query_property", + "score": 0.6660750158426613, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_game_info_parsing", + "score": 0.26785714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"mnist_pattern", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "dvqa", + "score": 0.9473684210526315, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "physics_exams_v", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "snli_ve_visual_entailment", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_selection", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_descriptive", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_rated_hotel_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "science_molecule_chemistry", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "game_info_parsing", + "score": 0.5064935064935063, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "deciphering_oracle_bone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signboard_identification", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "image_style_recognition", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_convexity_value_estimation", + "score": 0.5147224146995476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "3d_indoor_scene_text_bbox_prediction", + "score": 0.14081664519620604, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "movie_info_parsing", + "score": 0.3125, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "human_relationship_reasoning", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "graph_shortest_path_kamada_kawai", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "coco_person_detection", + "score": 0.6271347226619672, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chart_vqa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "nlvr2_two_image_compare_qa", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "math_exams_v", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "newspaper_ocr_in_query_box", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "mvsa_sentiment_classification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "egocentric_spatial_reasoning", + "score": 0.4444444444444444, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "graph_isomorphism", + "score": 0.6, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "code_programming_test_easy", + "score": 0.2708333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 24 + }, + { + "name": "biology_exams_v", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_number_recognition", + "score": 0.07142857142857142, 
+ "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "kvqa_knowledge_aware_qa", + "score": 0.47368421052631576, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "math_breakpoint", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "landmark_recognition_and_qa", + "score": 0.5555555555555555, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "map_diagram_qa", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pmc_vqa_medical_image_qa", + "score": 0.8421052631578947, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "newspaper_page_parse_and_count", + "score": 0.5777777777777776, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "science_basic_physics", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "electricity_future_prediction_from_table", + "score": 0.568157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "license_plate_recognition", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "places365_scene_type_classification", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "music_info_parsing", + "score": 0.3482142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multilingual_movie_info_parsing", + "score": 0.2040816326530612, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "iconqa_count_and_reasoning", + "score": 0.5263157894736842, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "graph_connectivity", + "score": 0.5833333333333334, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "graph_shortest_path_planar", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "famous_building_recognition", + "score": 0.84375, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "geometry_transformation", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "long_string_letter_recognition", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "handwritten_math_expression_extraction", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_solid", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "animal_pose_estimation", + "score": 0.29270584066940136, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "single_person_pose_estimation", + "score": 0.36770882803106975, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_area", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "hotel_booking_confirmation_parsing", + "score": 0.1, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ili_ratio_future_prediction", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "electricity_load_estimate_plot", + "score": 0.6239285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, 
+ { + "name": "tqa_textbook_qa", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_info_parsing", + "score": 0.21428571428571433, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "quizlet_question_solving", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "stock_price_future_prediction", + "score": 0.46871428571428575, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "Ad_count_detection", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "recover_masked_word_in_figure", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "polygon_interior_angles", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_grounding", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "latex_complex_formula_convertion", + "score": 0.35294117647058826, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "transit_map_intersection_points", + "score": 0.2261904761904762, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "arxiv_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_image_artifacts_indentification", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "song_title_identification_from_lyrics", + "score": 0.6071428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "actor_recognition_in_Movie", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "bongard_problem", + "score": 0.21052631578947367, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "ascii_art_understanding", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "calendar_schedule_suggestion", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_overlapped_circle", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_barman", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "planning_screenshot_floortile", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "medical_blood_vessels_recognition", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "location_vqa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mindmap_elements_parsing", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "mensa_iq_test", + "score": 0.3803921568627451, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "flowchart_code_generation", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "stackoverflow_debug_QA", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "logical_reasoning_find_odd_one_out", + 
"score": 0.03571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "web_action_prediction", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "code_execution", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 16 + }, + { + "name": "music_sheet_format_QA", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "annoying_word_search", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "interpret_force_perspective_illusion", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "healthcare_info_judgement", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_plot_position_relationship", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_depth_of_different_points", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "topological_sort", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_fundamental_wo_solution", + "score": 0.3673469387755102, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "geometry_reasoning_nested_squares", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "font_recognition", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_count_line_intersections", + "score": 0.4642857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "circuit_diagram_understanding", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "go_capture_stone", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "monthly_weather_days_count", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "weather_map_climate_type_temperature_parsing", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "top_video_creator_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "rebus", + "score": 0.391304347826087, + "eval_type": "rule", + "num_demo": 1, + "num_query": 23 + }, + { + "name": "ishihara_test", + "score": 0.4857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "paper_vqa", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "signage_navigation", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "webpage_code_understanding", + "score": 0.7777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "medical_counting_lymphocytes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "game_platform_support_identification", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_swipe", + "score": 0.519652560877361, + "eval_type": "rule", + "num_demo": 1, + 
"num_query": 13 + }, + { + "name": "mahjong", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "scibench_calculus_wo_solution", + "score": 0.2653061224489796, + "eval_type": "rule", + "num_demo": 1, + "num_query": 49 + }, + { + "name": "knowledge_graph_understanding", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "image_translation_en2cn", + "score": 0.42609161120798006, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "realworld_qa_en2cn", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_visual_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "GUI_Act_Web_Multi", + "score": 0.4711837794298108, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chinese_idiom_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_comparison", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_blocksworld", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "product_ocr_qa", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "geometry_reasoning_circled_letter", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Web_Single", + "score": 0.049588137529181904, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "extract_webpage_headline", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_storage", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "soccer_offside", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 9 + }, + { + "name": "geometry_reasoning_grid", + "score": 0.6785714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "relative_reflectance_of_different_regions", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "entertainment_web_game_style", + "score": 0.75, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "orchestra_score_recognition", + "score": 0.10714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "icon_arithmetic_puzzle", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_grippers", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "MMMU_pro_exam_screenshot", + "score": 0.24242424242424243, + "eval_type": "rule", + "num_demo": 1, + "num_query": 99 + }, + { + "name": "clevrer_physics", + "score": 0.55, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "MMMU_physics_chemistry_selected", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_tyreworld", + "score": 0.06666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "music_sheet_note_count", + "score": 
0.17647058823529413, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "hashtag_recommendation", + "score": 0.9226190476190476, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "llavaguard", + "score": 0.25, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_multi_organ_segmentation_rater", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "cultural_vqa", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "logical_reasoning_fit_pattern", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "character_recognition_in_TV_shows", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "highest_discount_game_price_identification", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "remaining_playback_time_calculation", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "medical_cell_recognition", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_find_legal_moves", + "score": 0.04744105231699729, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "distinguish_ai_generated_image", + "score": 0.6842105263157895, + "eval_type": "rule", + "num_demo": 1, + "num_query": 19 + }, + { + "name": "autonomous_driving_scene_analysis", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counting_single_image", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "GUI_Act_Mobile_tap", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "road_map_find_highway_between_two_place", + "score": 0.5882352941176471, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "chess_sygyzy_endgames", + "score": 0.09471861471861472, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "planning_screenshot_termes", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "multiple_states_identify_asia", + "score": 0.5428571428571428, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_africa", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_europe", + "score": 0.5857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "multiple_states_identify_americas", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "adapted_cvbench_distance", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "adapted_cvbench_count", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "adapted_cvbench_depth", + "score": 1.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "adapted_cvbench_relation", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_computer_aided_design", + 
"score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "symbolic_graphics_programs_scalable_vector_graphics", + "score": 0.2777777777777778, + "eval_type": "rule", + "num_demo": 1, + "num_query": 18 + }, + { + "name": "table_understanding_complex_question_answering", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "table_understanding_fact_verification", + "score": 0.7619047619047618, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_multi_question", + "score": 0.8095238095238094, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "panel_images_single_question", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Misinformation_GossipCop", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_HatefulMemes", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "MMSoc_Memotion", + "score": 0.6470588235294119, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "MMSoc_Misinformation_PolitiFact", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "poetry_acrostic_alliteration", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_acrostic", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_limerick", + "score": 0.7333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_custom_rhyming_scheme", + "score": 0.13333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_petrarchian_sonnet_optional_meter", + "score": 0.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_haiku", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "poetry_shakespearean_sonnet", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "screenshot_lighteval_math", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "screenshot_theoremqa", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "number_puzzle_sudoku", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "number_puzzle_kakuro_5x5", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "text_entity_replace", + "score": 0.7142857142857143, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "background_change", + "score": 0.7857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_attribute_edit", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "face_swap", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "text_style", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "out_of_context", + "score": 0.9285714285714286, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"clip_stable_diffusion_generate", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "veracity", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "counterfactual_arithmetic", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "maze_2d_8x8", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_shapes", + "score": 0.40136054421768713, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "shape_composition_colours", + "score": 0.4562641723356009, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "autorater_artifact_reason", + "score": 0.5333333333333333, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "chess_puzzles_crushing", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_checkmate", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "chess_puzzles_equality", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_notes", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_twitter", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_youtube", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_tiktok", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_excel", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_amazon", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_instagram", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_zoom", + "score": 0.3333333333333333, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "app_layout_understanding_word", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_iphone_settings", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_leetcode", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_ppt", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "app_layout_understanding_alipay", + "score": 0.7058823529411765, + "eval_type": "rule", + "num_demo": 1, + "num_query": 17 + }, + { + "name": "ocr_table_to_markdown", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_latex", + "score": 0.6428571428571429, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": 
"ocr_resume_employer_plain", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_journal", + "score": 0.07142857142857142, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_experience_plain", + "score": 0.42857142857142855, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_text_latex", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_article_authors", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_csv", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_math_equation", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_school_plain", + "score": 0.2857142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_table_to_html", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "ocr_resume_skill_plain", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "crossword_mini_5x5", + "score": 0.4285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "contain_position_length", + "score": 0.6, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "contain_repeat_length", + "score": 0.26666666666666666, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "contain_length", + "score": 0.8666666666666667, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "contain_contain_length", + "score": 1.0, + "eval_type": "rule", + "num_demo": 0, + "num_query": 15 + }, + { + "name": "pictionary_skribbl_io", + "score": 0.2, + "eval_type": "rule", + "num_demo": 1, + "num_query": 20 + }, + { + "name": "pictionary_doodle_guess", + "score": 0.8, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "pictionary_genai_output_chinese", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_cartoon_drawing_guess", + "score": 0.8571428571428571, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "pictionary_chinese_food_img2en", + "score": 0.5714285714285714, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "reward_models_i2t_reward", + "score": 0.35714285714285715, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_chinese_celebrity", + "score": 0.21428571428571427, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_papers", + "score": 0.4666666666666667, + "eval_type": "rule", + "num_demo": 1, + "num_query": 15 + }, + { + "name": "memorization_famous_treaty", + "score": 0.5357142857142857, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "memorization_indian_celebrity", + "score": 0.5, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_blogpost", + "score": 0.0, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + { + "name": "research_website_parsing_publication", + "score": 0.14285714285714285, + "eval_type": "rule", + "num_demo": 1, + "num_query": 14 + }, + 
+        "name": "research_website_parsing_homepage",
+        "score": 0.21428571428571427,
+        "eval_type": "rule",
+        "num_demo": 1,
+        "num_query": 14
+    }
+]
\ No newline at end of file
diff --git a/static/eval_results/SI/Idefics3/summary_results.json b/static/eval_results/SI/Idefics3/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Idefics3/summary_results.json
rename to static/eval_results/SI/Idefics3/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/InternVL2_2B/summary_results.json b/static/eval_results/SI/InternVL2_2B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/InternVL2_2B/summary_results.json
rename to static/eval_results/SI/InternVL2_2B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/InternVL2_76B/summary_results.json b/static/eval_results/SI/InternVL2_76B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/InternVL2_76B/summary_results.json
rename to static/eval_results/SI/InternVL2_76B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/InternVL2_8B/summary_results.json b/static/eval_results/SI/InternVL2_8B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/InternVL2_8B/summary_results.json
rename to static/eval_results/SI/InternVL2_8B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Llama_3_2_11B/summary_results.json b/static/eval_results/SI/Llama_3_2_11B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Llama_3_2_11B/summary_results.json
rename to static/eval_results/SI/Llama_3_2_11B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/MiniCPM_v2.6/summary_results.json b/static/eval_results/SI/MiniCPM_v2.6/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/MiniCPM_v2.6/summary_results.json
rename to static/eval_results/SI/MiniCPM_v2.6/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Molmo_72B/summary_results.json b/static/eval_results/SI/Molmo_72B/summary_and_keyword_stats.json
similarity index 98%
rename from static/eval_results/SI/Molmo_72B/summary_results.json
rename to static/eval_results/SI/Molmo_72B/summary_and_keyword_stats.json
index ceecd07087332338c36b4975d37ee56f4c31cc00..e8fe6912b753f368af2f70c3a35f80ade94de02b 100644
--- a/static/eval_results/SI/Molmo_72B/summary_results.json
+++ b/static/eval_results/SI/Molmo_72B/summary_and_keyword_stats.json
@@ -7,8 +7,8 @@
         "macro_mean_score": 0.36480000609384927,
         "missing_tasks": [
             "planning_screenshot_termes",
-            "table_understanding",
-            "MMSoc_Misinformation_PolitiFact"
+            "MMSoc_Misinformation_PolitiFact",
+            "table_understanding"
         ]
     },
     "open": {
diff --git a/static/eval_results/SI/Molmo_72B/task_results.json b/static/eval_results/SI/Molmo_72B/task_results.json
index 5afd5f91add75e7356970c4840d0ccf634eb7ede..71c9bf3c93b20a7273a6d3c0d0c856dbc5b01a52 100644
--- a/static/eval_results/SI/Molmo_72B/task_results.json
+++ b/static/eval_results/SI/Molmo_72B/task_results.json
@@ -1897,14 +1897,14 @@
         "num_query": 0
     },
     {
-        "name": "table_understanding",
+        "name": "MMSoc_Misinformation_PolitiFact",
         "score": 0.0,
         "eval_type": "rule",
         "num_demo": 0,
         "num_query": 0
     },
     {
-        "name": "MMSoc_Misinformation_PolitiFact",
+        "name": "table_understanding",
         "score": 0.0,
         "eval_type": "rule",
         "num_demo": 0,
diff --git a/static/eval_results/SI/Molmo_7B_D/summary_results.json b/static/eval_results/SI/Molmo_7B_D/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Molmo_7B_D/summary_results.json
rename to static/eval_results/SI/Molmo_7B_D/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/NVLM/summary_results.json b/static/eval_results/SI/NVLM/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/NVLM/summary_results.json
rename to static/eval_results/SI/NVLM/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/POINTS_15_7B/summary_results.json b/static/eval_results/SI/POINTS_15_7B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/POINTS_15_7B/summary_results.json
rename to static/eval_results/SI/POINTS_15_7B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/POINTS_7B/summary_results.json b/static/eval_results/SI/POINTS_7B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/POINTS_7B/summary_results.json
rename to static/eval_results/SI/POINTS_7B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Phi-3.5-vision/summary_results.json b/static/eval_results/SI/Phi-3.5-vision/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Phi-3.5-vision/summary_results.json
rename to static/eval_results/SI/Phi-3.5-vision/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Pixtral_12B/summary_results.json b/static/eval_results/SI/Pixtral_12B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Pixtral_12B/summary_results.json
rename to static/eval_results/SI/Pixtral_12B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Qwen2_VL_2B/summary_results.json b/static/eval_results/SI/Qwen2_VL_2B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Qwen2_VL_2B/summary_results.json
rename to static/eval_results/SI/Qwen2_VL_2B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Qwen2_VL_72B/summary_results.json b/static/eval_results/SI/Qwen2_VL_72B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Qwen2_VL_72B/summary_results.json
rename to static/eval_results/SI/Qwen2_VL_72B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/Qwen2_VL_7B/summary_results.json b/static/eval_results/SI/Qwen2_VL_7B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/Qwen2_VL_7B/summary_results.json
rename to static/eval_results/SI/Qwen2_VL_7B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/SmolVLM/summary_results.json b/static/eval_results/SI/SmolVLM/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/SmolVLM/summary_results.json
rename to static/eval_results/SI/SmolVLM/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/llava_onevision_72B/summary_results.json b/static/eval_results/SI/llava_onevision_72B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/llava_onevision_72B/summary_results.json
rename to static/eval_results/SI/llava_onevision_72B/summary_and_keyword_stats.json
diff --git a/static/eval_results/SI/llava_onevision_7B/summary_results.json b/static/eval_results/SI/llava_onevision_7B/summary_and_keyword_stats.json
similarity index 100%
rename from static/eval_results/SI/llava_onevision_7B/summary_results.json
rename to static/eval_results/SI/llava_onevision_7B/summary_and_keyword_stats.json