# ko-bench / app.py
import gradio as gr
import pandas as pd
import numpy as np
import plotly.express as px
import random
import plotly.graph_objects as go
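# Gradio app for the KO-Bench leaderboard: loads the KO-Bench scores and the MT-Bench
# reference leaderboard from CSV, builds combined and per-judge leaderboards, and renders
# per-category radar charts for a selected model.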
file_result_score = 'ko_bench.csv'
file_full_lb = 'mt_bench_240805.csv'
# read csv
df_result_score = pd.read_csv(file_result_score)
df_full_lb = pd.read_csv(file_full_lb)
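# Expected columns (inferred from the code below): ko_bench.csv provides model, judge_model,
# turn, score and the eight per-category scores (Coding, Extraction, Humanities, Math,
# Reasoning, Roleplay, STEM, Writing); mt_bench_240805.csv provides Model, MT-bench (score),
# Organization and License.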
# dataframe
df = pd.DataFrame(df_result_score)
df_rs = pd.DataFrame(df_result_score)
df_full_lboard = pd.DataFrame(df_full_lb)
df_full_lboard.replace('GPT-4-1106-preview', 'gpt-4-0125-preview', inplace=True) # rename MT-Bench's GPT-4-1106-preview to gpt-4-0125-preview
models = df_full_lboard['Model'].unique() # list of models, used when adding columns
df_rs.replace("", np.nan, inplace=True) # merge per-model turn 1 and turn 2 scores
def custom_mean(series):
    numeric_series = pd.to_numeric(series, errors='coerce') # convert the series to numeric values
    return numeric_series.mean() if not numeric_series.isna().all() else np.nan # compute the mean if at least one value is not NaN
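# e.g. custom_mean(pd.Series(['8.1', '', '7.9'])) -> 8.0; custom_mean(pd.Series(['', ''])) -> nan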
def get_mt_bench(model): # look up the MT-Bench score, matching model names case-insensitively
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['MT-bench (score)'].values[0]
    return ''
def get_organization(model): # look up the organization, matching model names case-insensitively
    if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
        return 'Mistral'
    elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
        return 'KISTI'
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['Organization'].values[0]
    return ''
def get_license(model): # look up the license, matching model names case-insensitively
    if pd.Series(model).str.contains('mistral-large', case=False, regex=True).any():
        return 'Apache-2.0'
    elif pd.Series(model).str.contains('koni-llama3-8b', case=False, regex=True).any():
        return 'llama3'
    model_lower = model.lower()
    matching_rows = df_full_lboard[df_full_lboard['Model'].str.lower() == model_lower]
    if not matching_rows.empty:
        return matching_rows['License'].values[0]
    return ''
# dataframe_full
df_full_rs = df_rs.copy()
df_full_rs.rename(columns={'score': 'KO-Bench'}, inplace=True)
df_full_rs = df_full_rs.drop(columns=['Coding', 'Extraction', 'Humanities', 'Math', 'Reasoning', 'Roleplay', 'STEM', 'Writing'])
df_full_rs = df_full_rs.drop(columns=['turn']) # merge per-model turn 1 and turn 2 scores
df_full_rs = df_full_rs.groupby(['model', 'judge_model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model', 'judge_model']}).reset_index()
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)
df_full_rs['KO-Bench/openai'] = '' # add KO-Bench/openai and KO-Bench/keval columns
df_full_rs['KO-Bench/keval'] = ''
for idx, j_model in df_full_rs['judge_model'].items():
    if j_model == 'keval':
        df_full_rs.at[idx, 'KO-Bench/keval'] = df_full_rs.at[idx, 'KO-Bench']
    else:
        df_full_rs.at[idx, 'KO-Bench/openai'] = df_full_rs.at[idx, 'KO-Bench']
df_full_rs = df_full_rs.drop(columns=['judge_model'])
df_full_rs = df_full_rs.groupby(['model']).agg({col: custom_mean for col in df_full_rs.columns if col not in ['model']}).reset_index() # merge the KO-Bench/openai and KO-Bench/keval rows per model
df_full_rs = df_full_rs.round(2)
df_full_rs.replace("", np.nan, inplace=True)
df_full_rs['MT-Bench'] = '' # add MT-Bench column
df_full_rs['MT-Bench'] = df_full_rs['model'].apply(get_mt_bench)
df_full_rs['MT-Bench'] = df_full_rs['MT-Bench'].str.replace('-', '', regex=False)
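# the MT-Bench CSV appears to use '-' as a placeholder for missing scores; stripping it leaves those cells empty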
df_full_rs['Organization'] = '' # add Organization column
df_full_rs['Organization'] = df_full_rs['model'].apply(get_organization)
df_full_rs['License'] = '' # add License column
df_full_rs['License'] = df_full_rs['model'].apply(get_license)
df_full_rs = df_full_rs.sort_values(by='KO-Bench', ascending=False)
df_full_rs.insert(0, 'rank', range(1, len(df_full_rs) + 1))
df_full_rs = df_full_rs.drop(columns=['KO-Bench'])
plot_models = df_full_rs['model'].unique() # list of models for the model detail view
# dataframe
df_rs['MT-Bench'] = '' # add MT-Bench column
df_rs['MT-Bench'] = df_rs['model'].apply(get_mt_bench)
df_rs['MT-Bench'] = df_rs['MT-Bench'].str.replace('-', '', regex=False)
df_rs.replace("", np.nan, inplace=True) # merge per-model turn 1 and turn 2 scores
# dataframe_openai
df_openai = pd.DataFrame(df_rs)
df_openai = df_openai[df_openai['judge_model'] != 'keval']
df_openai = df_openai.drop(columns=['judge_model', 'turn']) # merge per-model turn 1 and turn 2 scores
df_openai = df_openai.groupby('model').agg({col: custom_mean for col in df_openai.columns if col != 'model'}).reset_index()
df_openai = df_openai.round(2)
df_openai = df_openai.sort_values(by='score', ascending=False).reset_index(drop=True) # reset the index so row 0 is the top-ranked model
df_openai.insert(0, 'rank', range(1, len(df_openai) + 1))
# dataframe_keval
df_keval = pd.DataFrame(df_rs)
df_keval = df_keval[df_keval['judge_model'] == 'keval']
df_keval = df_keval.drop(columns=['judge_model', 'turn']) # merge per-model turn 1 and turn 2 scores
df_keval = df_keval.groupby('model').agg({col: custom_mean for col in df_keval.columns if col != 'model'}).reset_index()
df_keval = df_keval.round(2)
df_keval = df_keval.sort_values(by='score', ascending=False).reset_index(drop=True) # reset the index so row 0 is the top-ranked model
df_keval.insert(0, 'rank', range(1, len(df_keval) + 1))
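# after the reset_index calls above, df_openai.loc[0] and df_keval.loc[0] refer to the rank-1 model;
# the Top1 lookups in the plot functions below rely on this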
# model detail view
plot_models_list = plot_models.tolist()
CATEGORIES = ["Writing", "Roleplay", "Reasoning", "Math", "Coding", "Extraction", "STEM", "Humanities"]
category_labels = ['Selected model turn1', 'Selected model turn2', 'Top1 turn1', 'Top1 turn2']
random.seed(42)
def search_dataframe(query): # search the raw results dataframe
    if not query:
        return df # return the full DataFrame when there is no query
    filtered_df = df[df.apply(lambda row: any(row.astype(str) == query), axis=1)] # keep rows where any cell matches the query exactly
    return filtered_df
def radar_chart(categories, Selected_model_turn1, Selected_model_turn2, Top1_turn1, Top1_turn2): # draw the radar chart
    #categories = categories.split(',')
    # flatten the single-row lists returned by df.loc[...].values.tolist()
    Selected_model_turn1 = [item for sublist in Selected_model_turn1 for item in sublist]
    Selected_model_turn2 = [item for sublist in Selected_model_turn2 for item in sublist]
    Top1_turn1 = [item for sublist in Top1_turn1 for item in sublist]
    Top1_turn2 = [item for sublist in Top1_turn2 for item in sublist]
    values_lists = [
        list(map(float, Selected_model_turn1)),
        list(map(float, Selected_model_turn2)),
        list(map(float, Top1_turn1)),
        list(map(float, Top1_turn2))
    ]
    fig = go.Figure()
    for i, values in enumerate(values_lists):
        if len(categories) != len(values):
            return f"Error in dataset {i+1}: Number of categories and values must be the same."
        fig.add_trace(go.Scatterpolar(
            r=values + [values[0]], # closing the loop of the radar chart
            theta=categories + [categories[0]], # closing the loop of the radar chart
            mode='lines',
            name=category_labels[i] # label for the dataset
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, max(max(values) for values in values_lists)],
                showline=True,
            ),
            angularaxis=dict(
                rotation=0,
                direction='clockwise'
            )
        ),
        showlegend=True,
        width=555, # chart width
        height=550, # chart height
        margin=dict(l=1000, r=20, t=20, b=20),
        autosize=False,
        paper_bgcolor='white',
        plot_bgcolor='lightgrey'
    )
    return fig
def search_openai_plot(dropdown_model): # radar chart for the OpenAI (GPT-4o) judgments
    condition1 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
    openai_turn1 = df.loc[condition1, CATEGORIES].values.tolist() # select category columns in CATEGORIES order so values line up with the chart labels
    condition2 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
    openai_turn2 = df.loc[condition2, CATEGORIES].values.tolist()
    condition3 = (df['judge_model'] != 'keval') & (df['turn'] == 1) & (df['model'] == df_openai.loc[0, 'model'])
    top1_openai_turn1 = df.loc[condition3, CATEGORIES].values.tolist()
    condition4 = (df['judge_model'] != 'keval') & (df['turn'] == 2) & (df['model'] == df_openai.loc[0, 'model'])
    top1_openai_turn2 = df.loc[condition4, CATEGORIES].values.tolist()
    fig = radar_chart(CATEGORIES, openai_turn1, openai_turn2, top1_openai_turn1, top1_openai_turn2)
    return fig
def search_keval_plot(dropdown_model): # radar chart for the keval judgments
    condition1 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == dropdown_model)
    keval_turn1 = df.loc[condition1, CATEGORIES].values.tolist() # select category columns in CATEGORIES order so values line up with the chart labels
    condition2 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == dropdown_model)
    keval_turn2 = df.loc[condition2, CATEGORIES].values.tolist()
    condition3 = (df['judge_model'] == 'keval') & (df['turn'] == 1) & (df['model'] == df_keval.loc[0, 'model'])
    top1_keval_turn1 = df.loc[condition3, CATEGORIES].values.tolist()
    condition4 = (df['judge_model'] == 'keval') & (df['turn'] == 2) & (df['model'] == df_keval.loc[0, 'model'])
    top1_keval_turn2 = df.loc[condition4, CATEGORIES].values.tolist()
    fig = radar_chart(CATEGORIES, keval_turn1, keval_turn2, top1_keval_turn1, top1_keval_turn2)
    return fig
#gradio
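# The UI has four tabs: the combined KO-Bench leaderboard, the OpenAI-judged and
# keval-judged leaderboards, and a model detail view with per-category radar charts.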
with gr.Blocks() as demo:
    gr.Markdown("")
    gr.Markdown("# 🏆 KO-Bench Leaderboard")
    gr.Markdown("")
    gr.Markdown("#### Ko-Bench is a leaderboard for evaluating the multi-turn conversation and instruction-following abilities of Korean Large Language Models (LLMs).")
    gr.Markdown("- MT-Bench: a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.")
    gr.Markdown("- KO-Bench/openai: a set of challenging multi-turn questions in Korean. We use GPT-4o to grade the model responses.")
    gr.Markdown("- KO-Bench/keval: a set of challenging multi-turn questions in Korean. We use the keval model to grade the model responses.")
    gr.Markdown("")
    gr.Markdown("GitHub: https://github.com/davidkim205/ko-bench")
    gr.Markdown("keval: https://huggingface.co/collections/davidkim205/k-eval-6660063dd66e21cbdcc4fbf1")
    gr.Markdown("")
    with gr.TabItem("KO-Bench"):
        gr.Dataframe(value=df_full_rs)
    with gr.TabItem("OpenAI Judgment"):
        gr.Dataframe(value=df_openai)
    with gr.TabItem("Keval Judgment"):
        gr.Dataframe(value=df_keval)
    with gr.TabItem("Model Detail View"):
        with gr.Blocks():
            with gr.Row():
                dropdown = gr.Dropdown(choices=plot_models_list, label="Choose a Model")
            with gr.Row():
                dataframe = gr.Dataframe(label="Model Detail View")
                dropdown.change(fn=search_dataframe, inputs=dropdown, outputs=dataframe)
            with gr.Row():
                plot_openai = gr.Plot(label="OpenAI Plot")
                dropdown.change(fn=search_openai_plot, inputs=dropdown, outputs=plot_openai)
                #with gr.Row():
                plot_keval = gr.Plot(label="Keval Plot")
                dropdown.change(fn=search_keval_plot, inputs=dropdown, outputs=plot_keval)

demo.launch(share=True, server_name="0.0.0.0")