"""Gradio leaderboard app: shows AAD / IASD / IVQD result tables and accepts submissions.

Result CSVs live in a Hugging Face dataset repository that is cloned at import
time; accepted submissions are written to those CSVs, committed and pushed.
"""

# NOTE(review): only `block` is defined in the code visible here; the other
# three names are not, so `from <module> import *` would fail on them. Confirm
# they exist elsewhere or prune the list.
__all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissions']

# Import order is kept as in the original on purpose: `from constants import *`
# may shadow names imported before it, while the imports after it always win.
import gradio as gr
import pandas as pd
import re
import numpy as np
from collections import defaultdict
from constants import *
import os
from huggingface_hub import Repository
import json

TOKEN = os.environ.get("TOKEN")
repo = Repository(
    local_dir="./download_from_dataset",
    clone_from="MM-UPD/results_for_leaderboard",
    repo_type="dataset",
    use_auth_token=TOKEN,
)

current_directory = os.getcwd()

# Maps the `metric_type` argument of the scoring helpers to the hit column it averages.
_METRIC_COLUMNS = {"dual": "hit", "standard": "hit_standard", "upd": "hit_upd"}


def _require(condition, message=""):
    """Raise ``AssertionError(message)`` unless ``condition`` is truthy.

    Used instead of the ``assert`` statement so the checks still run under
    ``python -O`` while callers keep seeing the same exception type.
    """
    if not condition:
        raise AssertionError(message)


def validate_model_size(s):
    """Return ``s`` if it looks like ``'<digits>B'`` or ``'-'``; otherwise return ``'-'``."""
    # fullmatch: unlike match() with '$', a trailing newline is not accepted.
    if re.fullmatch(r'\d+B|-', s):
        return s
    return '-'


def upload_file(files):
    """Return the ``.name`` attribute of every object in ``files`` as a list."""
    return [file.name for file in files]


def create_df(input_file):
    """Decode UTF-8 JSON bytes and load the parsed data into a DataFrame.

    Args:
        input_file: Raw bytes of the uploaded JSON file.

    Returns:
        ``pandas.DataFrame`` built from the decoded JSON.
    """
    return pd.DataFrame(json.loads(input_file.decode('utf-8')))


# Accuracy Report
def report_acc(df, groupd='category', metric_type="dual"):
    """Compute mean hit rates, overall or per group.

    Args:
        df: Evaluation frame; must contain a ``split`` column and the hit
            column selected by ``metric_type``.
        groupd: ``None`` for one overall figure, or ``'category'`` /
            ``'l2-category'`` to report one figure per distinct value of that
            column.
        metric_type: ``"dual"``, ``"standard"`` or ``"upd"``. Any other value
            adds no accuracy column (original behaviour, kept).

    Returns:
        One-row DataFrame whose first column is ``split`` (always ``'test'``),
        followed by ``overall`` or one column per group value in sorted order.
        ``None`` when ``groupd`` is a name that is not a column of ``df``.

    Raises:
        AssertionError: If ``split`` is missing or ``groupd`` is unsupported.
    """
    _require('split' in df, "'split' column missing")
    _require(groupd in [None, 'category', 'l2-category'], f"Unsupported grouping: {groupd}")

    res = defaultdict(list)
    res['split'] = ['test']
    hit_column = _METRIC_COLUMNS.get(metric_type)

    if groupd is None:
        if hit_column is not None:
            res['overall'] = [np.mean(df[hit_column])]
        return pd.DataFrame(res)

    if groupd in df:
        for ability in sorted(set(df[groupd])):
            if hit_column is not None:
                sub_df = df[df[groupd] == ability]
                res[ability] = [np.mean(sub_df[hit_column])]
        return pd.DataFrame(res)

    return None


def eval_result_dual(data_main, metric_type="dual"):
    """Return ``(overall, per_category)`` accuracies as percentages rounded to 0.1.

    ``per_category`` is a list ordered like the sorted category names produced
    by :func:`report_acc`.
    """
    overall_df = report_acc(data_main, None, metric_type)
    leaf_df = report_acc(data_main, 'category', metric_type)

    overall = round(overall_df['overall'].values[0] * 100, 1)
    # Column 0 is 'split'; everything after it is a per-category accuracy.
    leaf = [round(x * 100, 1) for x in leaf_df.iloc[:, 1:].values.flatten().tolist()]

    return overall, leaf


def calculate_score(input_file):
    """Score an uploaded result file.

    Returns:
        ``(overall_dual, overall_standard, overall_upd, leaf_dual)`` where the
        first three are overall percentages and ``leaf_dual`` is the list of
        per-category dual percentages.
    """
    dual_df = create_df(input_file)
    overall_dual, leaf_dual = eval_result_dual(dual_df)
    overall_standard, _ = eval_result_dual(dual_df, metric_type="standard")
    overall_upd, _ = eval_result_dual(dual_df, metric_type="upd")

    return overall_dual, overall_standard, overall_upd, leaf_dual


# add the new data into the queue
def add_queue(base_df, input_file, model_name):
    """Copy the prediction and hit columns of a submission into ``base_df``.

    Five columns prefixed with ``model_name`` are added (or overwritten) in
    place, and the same ``base_df`` object is returned.
    """
    dual_df = create_df(input_file)
    for column in ("prediction_standard", "hit_standard", "prediction_upd", "hit_upd", "hit"):
        base_df[f"{model_name}_{column}"] = dual_df[column]
    return base_df


# check whether the input file is correct or not
def validity_check(input_file, UPD_type, question_type):
    """Validate an uploaded result file for the given UPD type and question type.

    Checks the row count against the expected size for ``UPD_type``, the
    presence and completeness of the hit/prediction columns, and whether the
    "none of the above"-style marker text appears where ``question_type``
    says it should or should not.

    Returns:
        ``True`` when every check passes.

    Raises:
        AssertionError: On the first failed check, with a describing message.
        KeyError: If ``UPD_type`` is unknown or the ``D_upd`` column is absent.
    """
    input_df = create_df(input_file)

    # check for the correct data size
    data_num_dict = {"AAD": 820, "IASD": 919, "IVQD": 356}
    _require(len(input_df) == data_num_dict[UPD_type], "Different Data Size")
    print("len(input)", len(input_df))
    print("data_num_dict[UPD_type]", data_num_dict[UPD_type])

    # check for missing columns
    column_list = ["hit_upd", "hit_standard", "hit", "prediction_upd", "prediction_standard"]
    _require(all(x in input_df.columns for x in column_list), "Column Missing")

    # check for missing values
    _require(not input_df[column_list].isnull().any().any(), "Missing values found in columns")

    # check for the presence of the correct values
    option_mapping = {"AAD": "None of the above", "IASD": "None of the above", "IVQD": "The image and question are irrelevant."}
    instruction_mapping = {"AAD": "F. None of the above", "IASD": "F. None of the above", "IVQD": "F. The image and question are irrelevant."}
    option_text = option_mapping[UPD_type]
    instruction_text = instruction_mapping[UPD_type]

    input_df["D_upd"] = input_df["D_upd"].fillna("")

    # regex=False: the marker texts contain '.', which must match literally.
    def option_in_choices():
        return input_df["D_upd"].str.contains(option_text, regex=False).any()

    def instruction_in_predictions():
        return input_df["prediction_upd"].str.contains(instruction_text, regex=False).any()

    if question_type == "Base":
        _require(not option_in_choices(), f"{option_text} found in Base")
        _require(not instruction_in_predictions(), f"{instruction_text} found in Base")
    elif question_type == "Option":
        _require(option_in_choices(), f"{option_text}not found in Option")
        _require(not instruction_in_predictions(), f"{instruction_text} found in Option")
    elif question_type == "Instruction":
        _require(not option_in_choices(), f"{option_text} found in Instruction")

    return True


def _plain_model_name(cell):
    """Return the link text of a ``[text](url)`` Markdown cell, or the cell text unchanged."""
    text = str(cell)
    if text.startswith('[') and '](' in text:
        return text[1:text.index('](')]
    return text


def add_new_eval(
    input_file,
    model_type: str,
    model_name_textbox: str,
    revision_name_textbox: str,
    model_link: str,
    model_size: str,
    upd_type: str,
    LLM_type: str,
    LLM_name_textbox: str,
    question_type: str
):
    """Validate a submission, add it to the result CSV and push it to the repo.

    Args:
        input_file: Raw bytes of the uploaded JSON result file, or ``None``.
        model_type: Value stored in the first result column.
        model_name_textbox: Model name used when no revision name is given.
        revision_name_textbox: If non-empty, used as the model name; when it
            matches an existing row's name, that row is overwritten.
        model_link: Optional URL; when given the name is stored as a Markdown link.
        model_size: Free text, normalised by :func:`validate_model_size`.
        upd_type: ``'AAD'``, ``'IASD'`` or ``'IVQD'``; selects the result CSV.
        LLM_type: Chosen LLM, or ``'Other'`` to use ``LLM_name_textbox``.
        LLM_name_textbox: LLM name used when ``LLM_type`` is ``'Other'``.
        question_type: Question type of the submission.

    Returns:
        ``0`` on success, otherwise a warning string describing the problem.

    Raises:
        AssertionError: If :func:`validity_check` rejects the file.
        FileNotFoundError: If a CSV is missing after being written.
    """
    if input_file is None:
        warning_text = "Error! Empty file!"
        print(warning_text)
        return warning_text

    model_size = validate_model_size(model_size)

    result_paths = {
        'AAD': CSV_AAD_RESULT_PATH,
        'IASD': CSV_IASD_RESULT_PATH,
        'IVQD': CSV_IVQD_RESULT_PATH,
    }
    if upd_type not in result_paths:
        # Previously this fell through and crashed on an unbound `csv_path`.
        warning_text = f"Error! Unknown UPD type: {upd_type}"
        print(warning_text)
        return warning_text
    csv_path = result_paths[upd_type]

    validity_check(input_file, upd_type, question_type)

    csv_data = pd.read_csv(csv_path)

    overall_dual_acc, overall_standard_acc, overall_upd_acc, leaf_dual = calculate_score(input_file)

    LLM_name = LLM_name_textbox if LLM_type == 'Other' else LLM_type

    # `col` is the row label to write: a new row by default, or the existing
    # row when a revision name matches one already in the table.
    col = csv_data.shape[0]
    if revision_name_textbox == '':
        model_name = model_name_textbox
    else:
        model_name = revision_name_textbox
        # NOTE(review): the match is on model name only, so with several rows
        # per model the first one is overwritten - confirm this is intended.
        name_list = [_plain_model_name(name) for name in csv_data['Model']]
        if revision_name_textbox in name_list:
            col = name_list.index(revision_name_textbox)

    if model_link != '':
        model_name = '[' + model_name + '](' + model_link + ')'

    # add new data
    new_data = [
        model_type,
        model_name,
        LLM_name,
        model_size,
        question_type,
        overall_dual_acc,
        overall_standard_acc,
        overall_upd_acc,
    ]
    new_data += leaf_dual

    existing_rows = csv_data.values.tolist()
    # If the same data already exists, return an error.
    if new_data in existing_rows:
        warning_text = "Error! The same data already exists!"
        print(warning_text)
        return warning_text
    # If the same model name already exists, return an error.
    # NOTE(review): a 5-element prefix never equals a full-width row, so this
    # branch cannot trigger as written. Left unchanged because enabling it
    # requires deciding how it should interact with revision overwrites.
    elif new_data[:5] in existing_rows:
        warning_text = "Error! The same data already exists! Please fill revision_name."
        print(warning_text)
        return warning_text

    csv_data.loc[col] = new_data
    csv_data.to_csv(csv_path, index=False)

    absolute_result_path = os.path.abspath(csv_path)
    if not os.path.exists(absolute_result_path):
        raise FileNotFoundError(f"File {absolute_result_path} not found")

    repo.git_pull()
    repo.git_add(absolute_result_path)

    csv_queue_path = os.path.join(CSV_QUEUE_DIR, f"detail_results_{upd_type.lower()}_{question_type.lower()}.csv")
    base_data = pd.read_csv(csv_queue_path)

    base_data = add_queue(base_data, input_file, model_name)
    base_data.to_csv(csv_queue_path, index=False)

    absolute_queue_path = os.path.abspath(csv_queue_path)
    if not os.path.exists(absolute_queue_path):
        raise FileNotFoundError(f"File {absolute_queue_path} not found")

    repo.git_add(absolute_queue_path)
    repo.git_commit(f"add {model_name} results in {question_type}")

    repo.git_push()

    return 0


def _submit_and_report(
    input_file,
    model_type,
    model_name_textbox,
    revision_name_textbox,
    model_link,
    model_size,
    upd_type,
    LLM_type,
    LLM_name_textbox,
    question_type,
):
    """Call :func:`add_new_eval` and return text for the submission result panel.

    Validation failures (``AssertionError``) become an error message instead
    of propagating; the ``0`` success code becomes a success message; warning
    strings are passed through unchanged.
    """
    try:
        outcome = add_new_eval(
            input_file,
            model_type,
            model_name_textbox,
            revision_name_textbox,
            model_link,
            model_size,
            upd_type,
            LLM_type,
            LLM_name_textbox,
            question_type,
        )
    except AssertionError as err:
        return f"Error! {err}"
    if isinstance(outcome, str):
        return outcome
    return "Success! Your results have been submitted."


def _load_sorted_results(csv_path):
    """Pull the repo, read ``csv_path`` and sort by overall dual accuracy, descending."""
    repo.git_pull()
    df = pd.read_csv(csv_path)
    return df.sort_values(by="Overall Dual Acc.", ascending=False)


def get_baseline_aad_df():
    """Return the sorted AAD results restricted to the model info and default AAD columns."""
    df = _load_sorted_results(CSV_AAD_RESULT_PATH)
    # `.value` is the checkbox group's construction-time value, not the live selection.
    return df[MODEL_INFO + checkbox_aad_group.value]


def get_all_aad_df():
    """Return all columns of the AAD results, sorted by overall dual accuracy."""
    return _load_sorted_results(CSV_AAD_RESULT_PATH)


def get_baseline_iasd_df():
    """Return the sorted IASD results restricted to the model info and default IASD columns."""
    df = _load_sorted_results(CSV_IASD_RESULT_PATH)
    return df[MODEL_INFO + checkbox_iasd_group.value]


def get_all_iasd_df():
    """Return all columns of the IASD results, sorted by overall dual accuracy."""
    return _load_sorted_results(CSV_IASD_RESULT_PATH)


def get_baseline_ivqd_df():
    """Return the sorted IVQD results restricted to the model info and default IVQD columns."""
    df = _load_sorted_results(CSV_IVQD_RESULT_PATH)
    return df[MODEL_INFO + checkbox_ivqd_group.value]


def get_all_ivqd_df():
    """Return all columns of the IVQD results, sorted by overall dual accuracy."""
    return _load_sorted_results(CSV_IVQD_RESULT_PATH)


def _row_passes_filters(row, model_size_filters, question_type_filters):
    """Return True if a result row matches the ticked model-size and question-type filters.

    Sizes are compared as ``'-'`` (unknown) or as the number before ``'B'``
    against the ``'<10B'`` / ``'>=10B'`` buckets; anything else never matches.
    """
    size_label = str(row['Model Size']).upper()
    if size_label == '-':
        size_ok = '-' in model_size_filters
    elif 'B' in size_label:
        size = float(size_label.replace('B', ''))
        size_ok = ('>=10B' in model_size_filters and size >= 10) or ('<10B' in model_size_filters and size < 10)
    else:
        size_ok = False
    return size_ok and row['Question Type'] in question_type_filters


def _make_filter_handler(load_all_df, task_columns, title_types, column_names):
    """Build the change-handler shared by the three benchmark tabs.

    Args:
        load_all_df: Zero-argument callable returning the full result frame.
        task_columns: Ordered list of selectable metric columns for the tab.
        title_types: Datatype per column, parallel to ``column_names``.
        column_names: All column names of the tab's table.

    Returns:
        A function ``(selected_model_size, selected_question_type,
        selected_columns)`` returning a fresh ``Dataframe`` component.
    """
    def on_filter_change(selected_model_size, selected_question_type, selected_columns):
        updated_data = load_all_df()

        # Explicit boolean Series: apply(axis=1) yields an unusable mask on an empty frame.
        mask = pd.Series(
            [
                _row_passes_filters(row, selected_model_size, selected_question_type)
                for row in updated_data.to_dict("records")
            ],
            index=updated_data.index,
            dtype=bool,
        )
        updated_data = updated_data[mask]

        # Keep the tab's canonical column order regardless of click order.
        ordered_columns = [item for item in task_columns if item in selected_columns]
        present_columns = MODEL_INFO + ordered_columns
        updated_data = updated_data[present_columns]
        # Nothing to sort by when every metric column is unticked.
        if ordered_columns:
            updated_data = updated_data.sort_values(by=ordered_columns[0], ascending=False)

        update_datatype = [title_types[column_names.index(x)] for x in present_columns]

        return gr.components.Dataframe(
            value=updated_data,
            headers=present_columns,
            type="pandas",
            datatype=update_datatype,
            interactive=False,
            visible=True,
        )

    return on_filter_change


def _wire_filters(handler, size_group, question_group, column_group, table):
    """Re-run ``handler`` into ``table`` whenever any of the three checkbox groups changes."""
    controls = [size_group, question_group, column_group]
    size_group.change(fn=handler, inputs=controls, outputs=table)
    question_group.change(fn=handler, inputs=controls, outputs=table)
    column_group.change(fn=handler, inputs=controls, outputs=table)


block = gr.Blocks()

with block:
    gr.Markdown(
        LEADERBORAD_INTRODUCTION
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # table mmupd bench
        with gr.TabItem("🏅 MM-AAD Benchmark", elem_id="mmaad-benchmark-tab-table", id=1):
            # selection for column part:
            checkbox_aad_group = gr.CheckboxGroup(
                choices=TASK_AAD_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )

                # selection for question type part:
                question_type = gr.CheckboxGroup(
                    choices=QUESTION_TYPE,
                    value=QUESTION_TYPE,
                    label="Question Type",
                    interactive=True,
                )

            baseline_value = get_baseline_aad_df()
            baseline_header = MODEL_INFO + checkbox_aad_group.value
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_aad_group.value)
            data_component_aad = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            _wire_filters(
                _make_filter_handler(get_all_aad_df, TASK_AAD_INFO, DATA_AAD_TITILE_TYPE, COLUMN_AAD_NAMES),
                model_size,
                question_type,
                checkbox_aad_group,
                data_component_aad,
            )

            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        elem_id="citation-button",
                        show_copy_button=True,
                    )

        with gr.TabItem("🏅 MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
            checkbox_iasd_group = gr.CheckboxGroup(
                choices=TASK_IASD_INFO,
                value=AVG_INFO,
                label="Evaluation Dimension",
                interactive=True,
            )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )

                # selection for question type part:
                question_type = gr.CheckboxGroup(
                    choices=QUESTION_TYPE,
                    value=QUESTION_TYPE,
                    label="Question Type",
                    interactive=True,
                )

            baseline_value = get_baseline_iasd_df()
            baseline_header = MODEL_INFO + checkbox_iasd_group.value
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_iasd_group.value)
            data_component_iasd = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            _wire_filters(
                _make_filter_handler(get_all_iasd_df, TASK_IASD_INFO, DATA_IASD_TITILE_TYPE, COLUMN_IASD_NAMES),
                model_size,
                question_type,
                checkbox_iasd_group,
                data_component_iasd,
            )

            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        elem_id="citation-button",
                        show_copy_button=True,
                    )

        # Table 3
        # NOTE(review): this tab reuses the IASD tab's elem_id - confirm whether
        # any styling depends on it before renaming.
        with gr.TabItem("🏅 MM-IVQD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=3):
            with gr.Row():
                # selection for column part:
                checkbox_ivqd_group = gr.CheckboxGroup(
                    choices=TASK_IVQD_INFO,
                    value=AVG_INFO,
                    label="Evaluation Dimension",
                    interactive=True,
                )  # user can select the evaluation dimension

            with gr.Row():
                # selection for model size part:
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label="Model Size",
                    interactive=True,
                )

                # selection for question type part:
                question_type = gr.CheckboxGroup(
                    choices=QUESTION_TYPE,
                    value=QUESTION_TYPE,
                    label="Question Type",
                    interactive=True,
                )

            baseline_value = get_baseline_ivqd_df()
            baseline_header = MODEL_INFO + checkbox_ivqd_group.value
            baseline_datatype = ['markdown'] * 4 + ['number'] * len(checkbox_ivqd_group.value)
            data_component_ivqd = gr.components.Dataframe(
                value=baseline_value,
                headers=baseline_header,
                type="pandas",
                datatype=baseline_datatype,
                interactive=False,
                visible=True,
            )

            _wire_filters(
                _make_filter_handler(get_all_ivqd_df, TASK_IVQD_INFO, DATA_IVQD_TITILE_TYPE, COLUMN_IVQD_NAMES),
                model_size,
                question_type,
                checkbox_ivqd_group,
                data_component_ivqd,
            )

            with gr.Accordion("Citation", open=False):
                citation_button = gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id="citation-button",
                    show_copy_button=True,
                )

        # table 4
        with gr.TabItem("📝 About", elem_id="mmupd-benchmark-tab-table", id=4):
            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")

        # table 5
        with gr.TabItem("🚀 Submit here! ", elem_id="mmupd-benchmark-tab-table", id=5):
            with gr.Row():
                gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model evaluation json file here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_type = gr.Dropdown(
                        choices=["VLM", "LLM"],
                        label="Model type",
                        multiselect=False,
                        value="VLM",
                        interactive=True,
                    )
                    model_name_textbox = gr.Textbox(
                        label="Model name", placeholder="LLaMA-7B"
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision Model Name", placeholder="LLaMA-7B"
                    )
                    model_link = gr.Textbox(
                        label="Model Link", placeholder="https://huggingface.co/decapoda-research/llama-7b-hf"
                    )
                    model_size = gr.Textbox(
                        label="Model size", placeholder="7B(Input content format must be 'number+B' or '-', default is '-')"
                    )

                with gr.Column():
                    LLM_type = gr.Dropdown(
                        choices=["Vicuna-1.5-7B", "Vicuna-1.5-13B", "Flan-T5-XL", "LLaMA-7B", "Llama-13B", "Llama-3-8B", "Llama-3-70B", "Yi-34B", "Mistral-7B", "Other"],
                        label="LLM type",
                        multiselect=False,
                        value="Vicuna-1.5-13B",
                        interactive=True,
                    )
                    LLM_name_textbox = gr.Textbox(
                        label="LLM model (Required for Other)",
                        placeholder="GPT-4",
                    )
                    upd_type = gr.Dropdown(
                        choices=[
                            "AAD",
                            "IASD",
                            "IVQD",
                        ],
                        label="UPD type",
                        multiselect=False,
                        value="AAD",
                        interactive=True,
                    )
                    question_type = gr.Dropdown(
                        choices=QUESTION_TYPE,
                        label="Question Type",
                        multiselect=False,
                        value=QUESTION_TYPE[0],
                        interactive=True,
                    )

            with gr.Column():
                # type='binary' hands the handler raw bytes, which create_df decodes.
                input_file = gr.components.File(label="Click to Upload a Dual Evaluation File", file_count="single", type='binary')
                submit_button = gr.Button("Submit Eval")

                submission_result = gr.Markdown()
                # The click previously had no outputs, so the warning/success
                # text returned by the handler was never displayed.
                submit_button.click(
                    _submit_and_report,
                    inputs=[
                        input_file,
                        model_type,
                        model_name_textbox,
                        revision_name_textbox,
                        model_link,
                        model_size,
                        upd_type,
                        LLM_type,
                        LLM_name_textbox,
                        question_type,
                    ],
                    outputs=submission_result,
                )

    def refresh_data():
        """Reload the three baseline tables (pulls the repo for each)."""
        value1 = get_baseline_aad_df()
        value2 = get_baseline_iasd_df()
        value3 = get_baseline_ivqd_df()

        return value1, value2, value3

    with gr.Row():
        data_run = gr.Button("Refresh")
        data_run.click(
            refresh_data, outputs=[data_component_aad, data_component_iasd, data_component_ivqd]
        )

block.launch()