import gradio as gr
import pandas as pd
from src.css_html import custom_css
from src.utils import (
AutoEvalColumn,
fields,
make_clickable_names,
make_plot_data
)
from src.demo import (
generate,
random_examples,
return_ground_truth,
)
# Initial value of the "System prompt" textbox in the demo section.
DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
# Upper bound and initial value of the "Max new tokens" slider.
MAX_MAX_NEW_TOKENS = 1024
DEFAULT_MAX_NEW_TOKENS = 512
# Leaderboard data, read once when the module is loaded.
df = pd.read_csv("data/eval_board.csv")
# Names and datatypes of every column described by AutoEvalColumn.
COLS = [c.name for c in fields(AutoEvalColumn)]
TYPES = [c.type for c in fields(AutoEvalColumn)]
# "Lite" variants: only fields flagged displayed_by_default and not hidden.
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
def add_new_eval(
    model: str,
    re2text_easy_precision: str,
    re2text_hard_precision: str,
    text2re_easy_precision: str,
    text2re_hard_precision: str,
    links: str,
) -> dict:
    """Assemble a leaderboard entry from submitted evaluation results.

    Args:
        model: Name of the evaluated model.
        re2text_easy_precision: Score on the Re2Text easy setting.
        re2text_hard_precision: Score on the Re2Text hard setting.
        text2re_easy_precision: Score on the Text2Re easy setting.
        text2re_hard_precision: Score on the Text2Re hard setting.
        links: Link associated with the submission.

    Returns:
        A dict with the keys ``model``, ``re2text_easy``, ``re2text_hard``,
        ``text2re_easy``, ``text2re_hard`` and ``link``.
    """
    print("adding new eval")
    # Return the entry instead of discarding it, so a caller can persist or
    # display it; callers that ignore the result are unaffected.
    return {
        "model": model,
        "re2text_easy": re2text_easy_precision,
        "re2text_hard": re2text_hard_precision,
        "text2re_easy": text2re_easy_precision,
        "text2re_hard": text2re_hard_precision,
        "link": links,
    }
def select_columns(df, columns):
    """Return ``df`` narrowed to the model column plus the requested columns.

    Args:
        df: Table to take the columns from.
        columns: Names of the optional columns to keep.

    Returns:
        The result of indexing ``df`` with the model column followed by every
        name from ``COLS`` that is present in ``df`` and listed in ``columns``.
    """
    pinned = [AutoEvalColumn.model.name]
    # Iterate over COLS rather than ``columns`` so the output keeps the
    # canonical column ordering.
    requested = [
        name for name in COLS if name in df.columns and name in columns
    ]
    return df[pinned + requested]
# Copy the 'Models' column into 'pure_name' before the frame is handed to
# make_clickable_names (presumably that helper rewrites the displayed names
# into links — confirm in src.utils).
df["pure_name"] = df['Models']
df = make_clickable_names(df)
# Gradio UI: leaderboard tabs followed by an interactive demo section.
# NOTE(review): the row/column/tab nesting below should be confirmed against
# the rendered app.
demo = gr.Blocks(css=custom_css)
with demo:
    # Page title.
    with gr.Row():
        gr.Markdown(
            """
🤖 ConvRe 🤯 Leaderboard
""",
            elem_classes="markdown-text",
        )
    # Short description of the benchmark.
    gr.Markdown("""🤖**ConvRe**🤯 is the benchmark proposed in our EMNLP 2023 main conference paper: [An Investigation of LLMs’ Inefficacy in Understanding Converse Relations](https://arxiv.org/abs/2310.05163).
It aims to evaluate LLMs' ability on understanding converse relations.
Converse relation is defined as the opposite of semantic relation while keeping the surface form of the triple unchanged.
For example, the triple `(x, has part, y)` is interpreted as "x has a part called y" in normal relation, while "y has a part called x" in converse relation 🔁.
The experiments in our paper suggested that LLMs often resort to shortcut learning (or superficial correlations) and still face challenges on our 🤖ConvRe🤯 benchmark even for powerful models like GPT-4.
""", elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🔢 Data", id=0):
            with gr.Accordion("➡️ See All Columns", open=False):
                # The model column is always displayed, so it is left out of
                # the selectable choices.
                shown_columns = gr.CheckboxGroup(
                    choices=[
                        c for c in COLS if c not in [AutoEvalColumn.model.name]
                    ],
                    value=[
                        c for c in COLS_LITE if c not in [AutoEvalColumn.model.name]
                    ],
                    label="",
                    elem_id="column-select",
                    interactive=True
                )
            # Visible table: model column plus the initially selected columns.
            leaderboard_df_re2text = gr.components.Dataframe(
                value=df[
                    [
                        AutoEvalColumn.model.name,
                    ] + shown_columns.value
                ],
                headers=[
                    AutoEvalColumn.model.name,
                ] + shown_columns.value,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
            )
            # Invisible copy of the full table; it is the input that
            # select_columns filters when the selection changes.
            hidden_leaderboard_df_re2text = gr.components.DataFrame(
                value=df,
                headers=COLS,
                datatype=["str" for _ in range(len(COLS))],
                visible=False,
            )
            shown_columns.change(
                select_columns,
                [hidden_leaderboard_df_re2text, shown_columns],
                leaderboard_df_re2text
            )
        with gr.TabItem("📊 Plot", id=1):
            # One line plot per task, side by side.
            with gr.Row():
                with gr.Column():
                    gr.LinePlot(
                        make_plot_data(df, task="Re2Text"),
                        x="Setting",
                        y="Accuracy",
                        color="Symbol",
                        title="Re2Text",
                        y_lim=[0, 100],
                        x_label_angle=0,
                        height=400,
                        width=500,
                    )
                with gr.Column():
                    gr.LinePlot(
                        make_plot_data(df, task="Text2Re"),
                        x="Setting",
                        y="Accuracy",
                        color="Symbol",
                        title="Text2Re",
                        y_lim=[0, 100],
                        x_label_angle=0,
                        height=400,
                        width=500,
                    )
        with gr.TabItem("Submit results 🚀", id=3):
            gr.Markdown("""
Coming Soon ❤️
""")
    # Demo section: model answer, user input and ground-truth display.
    with gr.Column():
        gr.Markdown(
            """ 🤖ConvRe🤯 Demo (Llama-2-Chat-7B🦙)
\
\
""",
            elem_classes="markdown-text",
        )
        output_box = gr.Textbox(lines=10, max_lines=10, label="Llama-2-Chat-7B Answer", interactive=False)
        input_box = gr.Textbox(lines=12, max_lines=12, label="User Input")
        ground_truth_display = gr.Textbox("", lines=1, max_lines=1, label="😊Correct Answer😊", interactive=False)
    with gr.Column():
        # Generation settings passed to ``generate`` on submit.
        with gr.Accordion("Additional Inputs", open=False):
            sys_prompt = gr.Textbox(label="System prompt", value=DEFAULT_SYSTEM_PROMPT, lines=6)
            max_new_tokens = gr.Slider(
                label="Max new tokens",
                minimum=1,
                maximum=MAX_MAX_NEW_TOKENS,
                step=1,
                value=DEFAULT_MAX_NEW_TOKENS,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.1,
                maximum=4.0,
                step=0.1,
                value=0.1,
            )
        # Each button passes a fixed task key (held in an invisible text
        # component) to ``random_examples`` and writes the result to the
        # input box.
        with gr.Row():
            re2text_easy_btn = gr.Button("Random Re2Text Easy Example 😄")
            re2text_easy_btn.click(
                fn=random_examples,
                inputs=gr.Text("re2text-easy", visible=False),
                outputs=input_box,
            )
            re2text_hard_btn = gr.Button("Random Re2Text Hard Example 🤯")
            re2text_hard_btn.click(
                fn=random_examples,
                inputs=gr.Text("re2text-hard", visible=False),
                outputs=input_box,
            )
            text2re_easy_btn = gr.Button("Random Text2Re Easy Example 😄")
            text2re_easy_btn.click(
                fn=random_examples,
                inputs=gr.Text("text2re-easy", visible=False),
                outputs=input_box,
            )
            text2re_hard_btn = gr.Button("Random Text2Re Hard Example 🤯")
            text2re_hard_btn.click(
                fn=random_examples,
                inputs=gr.Text("text2re-hard", visible=False),
                outputs=input_box,
            )
        with gr.Row():
            gr.ClearButton([input_box, output_box])
            submit_btn = gr.Button("Submit🏃")
            submit_btn.click(generate, inputs=[input_box, sys_prompt, temperature, max_new_tokens], outputs=[output_box])
            answer_btn = gr.Button("Answer🤔")
            answer_btn.click(return_ground_truth, inputs=[], outputs=[ground_truth_display])
# Queueing is already enabled by the explicit ``queue()`` call. The
# ``enable_queue`` argument of ``launch()`` is deprecated (and rejected by
# newer Gradio releases), so it is not passed.
demo.queue(max_size=32).launch()