Iker's picture
Text fixes
11def42
raw
history blame
5.9 kB
import gradio as gr
import pandas as pd
from dataset import get_dataframe
from markdown import GUIDELINES, PANEL_MARKDOWN
# Load the full table once at import time. filter_dataframe_corpus and
# filter_dataframe_model read this module-level frame on every call.
df = get_dataframe()
def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes):
    """
    Filter the dataframe based on the provided evaluation dataset, contaminated source, and checkboxes.

    Text queries are matched as literal, case-sensitive substrings. They come
    straight from free-text input, so they are deliberately not interpreted as
    regular expressions: a query such as "(main" or "C++" must not raise, and
    "." must not act as a wildcard. Rows whose cell is missing never match a
    non-empty query. An empty query (or a non-string value) leaves the
    corresponding column unfiltered.

    Args:
        dataframe (pandas.DataFrame): The input dataframe to filter.
        eval_dataset (str): Substring to look for in "Evaluation Dataset".
        cont_source (str): Substring to look for in "Contaminated Source".
        checkboxes (list): Selected search options. Recognised values are
            "Exclude model-based evidences" and "Show only contaminated".
            Anything that is not a list is treated as "no option selected".

    Returns:
        pandas.DataFrame: The filtered dataframe.
    """
    text_filters = (
        ("Evaluation Dataset", eval_dataset),
        ("Contaminated Source", cont_source),
    )
    for column, query in text_filters:
        # Skip non-string values (callers may pass placeholders) and empty
        # queries, which would match everything anyway.
        if isinstance(query, str) and query:
            # regex=False: literal match; na=False: missing cells become
            # False instead of NA, which boolean indexing would reject.
            matches = dataframe[column].str.contains(query, regex=False, na=False)
            dataframe = dataframe[matches]

    selected = checkboxes if isinstance(checkboxes, list) else []

    if "Exclude model-based evidences" in selected:
        dataframe = dataframe[dataframe["Approach"] != "model-based"]

    if "Show only contaminated" in selected:
        # Keep a row when at least one split reports a non-zero value.
        dataframe = dataframe[
            (dataframe["Train Split"] > 0.0)
            | (dataframe["Development Split"] > 0.0)
            | (dataframe["Test Split"] > 0.0)
        ]

    return dataframe
def filter_dataframe_corpus(*args, **kwargs) -> pd.DataFrame:
    """
    Return the corpus rows of the global table, filtered by the given criteria.

    Rows are selected where "Model or corpus" equals "corpus"; that column is
    then dropped and the remaining arguments are forwarded unchanged to
    filter_dataframe.

    Returns:
        pandas.DataFrame: The filtered dataframe for corpus contamination.
    """
    is_corpus = df["Model or corpus"] == "corpus"
    # The discriminator column is constant after selection, so remove it.
    corpus_rows = df.loc[is_corpus].drop(columns=["Model or corpus"])
    return filter_dataframe(corpus_rows, *args, **kwargs)
def filter_dataframe_model(*args, **kwargs) -> pd.DataFrame:
    """
    Return the model rows of the global table, filtered by the given criteria.

    Rows are selected where "Model or corpus" equals "model"; that column is
    then dropped and the remaining arguments are forwarded unchanged to
    filter_dataframe.

    Returns:
        pandas.DataFrame: The filtered dataframe for model contamination.
    """
    is_model = df["Model or corpus"] == "model"
    # The discriminator column is constant after selection, so remove it.
    model_rows = df.loc[is_model].drop(columns=["Model or corpus"])
    return filter_dataframe(model_rows, *args, **kwargs)
# Soft theme with an emerald/red palette and small text and spacing.
# Poppins is the preferred typeface; the generic families listed after it give
# the browser a sans-serif fallback when the Google font cannot be loaded
# (repeating the same font several times provides no fallback at all).
theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="red",
    text_size="sm",
    spacing_size="sm",
    font=[
        gr.themes.GoogleFont("Poppins"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
).set(block_background_fill="*neutral_50", block_background_fill_dark="*neutral_950")
# Build the UI: two search tabs (corpus / model) that share the same layout,
# plus a static tab with the contribution guidelines.
with gr.Blocks(
    theme=theme,
    title="💨 Data Contamination Report",
    analytics_enabled=False,
) as demo:
    gr.Markdown(PANEL_MARKDOWN)

    with gr.Tab("Corpus contamination") as tab_corpus:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_corpus = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_corpora = gr.Textbox(
                    placeholder="Pre-training corpora",
                    label="Pre-training corpora",
                    value="",
                )
            with gr.Column():
                checkboxes_corpus = gr.CheckboxGroup(
                    ["Exclude model-based evidences", "Show only contaminated"],
                    label="Search options",
                    value=[],
                )
                filter_corpus_btn = gr.Button("Filter")

        # Initial table: no criteria yet, so pass None for every filter
        # argument (non-string / non-list values are ignored by the filter).
        # NOTE(review): only this initial value is styled to two decimals; the
        # value returned by the Filter button handler is a plain DataFrame.
        corpus_dataframe = gr.DataFrame(
            value=filter_dataframe_corpus(None, None, None).style.format(precision=2),
            headers=df.columns.to_list(),
            datatype=[
                "markdown",
                "markdown",
                "number",
                "number",
                "number",
                "str",
                "markdown",
                "markdown",
            ],
        )

    with gr.Tab("Model contamination") as tab_model:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_model = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                # Label now matches the placeholder; this box searches models,
                # not pre-training corpora.
                cont_model = gr.Textbox(
                    placeholder="Model",
                    label="Model",
                    value="",
                )
            with gr.Column():
                checkboxes_model = gr.CheckboxGroup(
                    ["Exclude model-based evidences", "Show only contaminated"],
                    label="Search options",
                    value=[],
                )
                filter_model_btn = gr.Button("Filter")

        # Initial table: unfiltered model rows (see the corpus tab above).
        model_dataframe = gr.DataFrame(
            value=filter_dataframe_model(None, None, None),
            headers=df.columns.to_list(),
            datatype=[
                "markdown",
                "markdown",
                "number",
                "number",
                "number",
                "str",
                "markdown",
                "markdown",
            ],
        )

    # Re-run the matching filter and refresh the table when a button is clicked.
    filter_corpus_btn.click(
        filter_dataframe_corpus,
        inputs=[eval_dataset_corpus, cont_corpora, checkboxes_corpus],
        outputs=corpus_dataframe,
    )
    filter_model_btn.click(
        filter_dataframe_model,
        inputs=[eval_dataset_model, cont_model, checkboxes_model],
        outputs=model_dataframe,
    )

    with gr.Tab("Contribution Guidelines") as tab_guidelines:
        gr.Markdown(GUIDELINES)

demo.launch()