import gradio as gr
import os
import json
import pandas as pd

from scripts.evaluate_information_integration import evaluate_information_integration
from scripts.evaluate_negative_rejection import evaluate_negative_rejection
from scripts.helper import update_config
from scripts.evaluate_noise_robustness import evaluate_noise_robustness
from scripts.evaluate_factual_robustness import evaluate_factual_robustness

Noise_Robustness_DIR = "results/Noise Robustness/"
Negative_Rejection_DIR = "results/Negative Rejection/"
Counterfactual_Robustness_DIR = "results/Counterfactual Robustness/"
Information_Integration_DIR = "results/Information Integration/"
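# The loaders below assume score files named "scores_<model>_..."; the
# negative-rejection and counterfactual loaders additionally assume the model
# name sits at filename.split("_")[1], i.e. model names without underscores.
# Illustrative sketch of the JSON fields read in this module (values are made
# up, and each results directory may only contain a subset of these keys):
#
#   {
#       "model": "llama3-8b-8192",   # load_scores
#       "noise_rate": 0.2,           # load_scores
#       "accuracy": 0.86,            # noise robustness / information integration
#       "reject_rate": 0.45,         # negative rejection / counterfactual robustness
#       "all_rate": 0.80,            # counterfactual robustness
#       "correct_rate": 0.55         # counterfactual robustness
#   }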
def load_scores(file_dir):
    """Load per-model accuracy scores from a results directory into a DataFrame."""
    models = set()
    noise_rates = set()

    if not os.path.exists(file_dir):
        return pd.DataFrame(columns=["Noise Ratio"])

    score_data = {}

    for filename in os.listdir(file_dir):
        if filename.startswith("scores_") and filename.endswith(".json"):
            filepath = os.path.join(file_dir, filename)
            with open(filepath, "r") as f:
                score = json.load(f)

            model = score["model"]
            noise_rate = str(score["noise_rate"])

            models.add(model)
            noise_rates.add(noise_rate)

            score_data[(model, noise_rate)] = score["accuracy"]

    # One row per model, one column per noise rate, accuracy formatted as a percentage.
    df = pd.DataFrame([
        {
            "Noise Ratio": model,
            **{
                rate: f"{score_data.get((model, rate), 'N/A') * 100:.2f}"
                if score_data.get((model, rate), "N/A") != "N/A"
                else "N/A"
                for rate in sorted(noise_rates, key=float)
            }
        }
        for model in sorted(models)
    ])

    return df
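# Illustrative output shape of load_scores (placeholders, not real results); note
# that the first column is labelled "Noise Ratio" but actually holds the model name:
#
#   Noise Ratio       0.2       0.4      ...
#   <model name>    <acc %>   <acc %>    ...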
def load_negative_rejection_scores():
    """Load rejection rates (noise rate 1.0 runs only) into a DataFrame."""
    if not os.path.exists(Negative_Rejection_DIR):
        return pd.DataFrame()

    score_data = {}
    models = set()

    for filename in os.listdir(Negative_Rejection_DIR):
        if filename.startswith("scores_") and filename.endswith(".json") and "_noise_1.0_" in filename:
            filepath = os.path.join(Negative_Rejection_DIR, filename)
            with open(filepath, "r") as f:
                score = json.load(f)

            # Filenames are expected to look like "scores_<model>_noise_1.0_...".
            model = filename.split("_")[1]
            models.add(model)
            score_data[model] = score.get("reject_rate", "N/A")

    df = pd.DataFrame([
        {
            "Model": model,
            "Rejection Rate": f"{score_data.get(model, 'N/A') * 100:.2f}%"
            if score_data.get(model, "N/A") != "N/A"
            else "N/A"
        }
        for model in sorted(models)
    ])

    return df if not df.empty else pd.DataFrame(columns=["Model", "Rejection Rate"])
def load_counterfactual_robustness_scores():
    """Load counterfactual-robustness metrics per model into a DataFrame."""
    models = set()

    if not os.path.exists(Counterfactual_Robustness_DIR):
        return pd.DataFrame(columns=["Model", "Accuracy (%)", "Error Detection Rate (%)", "Correction Rate (%)"])

    score_data = {}

    for filename in os.listdir(Counterfactual_Robustness_DIR):
        if filename.startswith("scores_") and filename.endswith(".json"):
            filepath = os.path.join(Counterfactual_Robustness_DIR, filename)
            with open(filepath, "r") as f:
                score = json.load(f)

            model = filename.split("_")[1]
            models.add(model)
            score_data[model] = {
                "Accuracy (%)": int(score["all_rate"] * 100),
                "Error Detection Rate (%)": int(score["reject_rate"] * 100),
                "Correction Rate (%)": round(score["correct_rate"] * 100, 2),
            }

    df = pd.DataFrame([
        {
            "Model": model,
            "Accuracy (%)": score_data.get(model, {}).get("Accuracy (%)", "N/A"),
            "Error Detection Rate (%)": score_data.get(model, {}).get("Error Detection Rate (%)", "N/A"),
            "Correction Rate (%)": f"{score_data.get(model, {}).get('Correction Rate (%)', 'N/A'):.2f}",
        }
        for model in sorted(models)
    ])

    return df
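# Note: all_rate, reject_rate and correct_rate are assumed to be fractions in
# [0, 1] (matching how reject_rate is scaled elsewhere in this module), hence
# the * 100 scaling above; if the evaluation scripts already emit percentages,
# that scaling should be dropped.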
def launch_gradio_app(config):
    with gr.Blocks() as app:
        app.title = "RAG System Evaluation"
        gr.Markdown("# RAG System Evaluation on RGB Dataset")

        with gr.Row():
            model_name_input = gr.Dropdown(
                label="Model Name",
                choices=config["models"],
                value="llama3-8b-8192",
                interactive=True
            )
            noise_rate_input = gr.Slider(label="Noise Rate", minimum=0, maximum=1.0, step=0.2, value=0.2, interactive=True)
            num_queries_input = gr.Number(label="Number of Queries", value=50, interactive=True)

        with gr.Row():
            recalculate_noise_btn = gr.Button("Evaluate Noise Robustness")
            recalculate_negative_btn = gr.Button("Evaluate Negative Rejection")
            recalculate_counterfactual_btn = gr.Button("Evaluate Counterfactual Robustness")
            recalculate_integration_btn = gr.Button("Evaluate Information Integration")

        with gr.Row():
            refresh_btn = gr.Button("Refresh", variant="primary", scale=0)

        with gr.Row():
            with gr.Column():
                gr.Markdown("### Noise Robustness\n**Description:** The experimental results of noise robustness, measured by accuracy (%) under different noise ratios. The results show that an increasing noise rate poses a challenge for RAG in LLMs.")
                noise_table = gr.Dataframe(value=load_scores(Noise_Robustness_DIR), interactive=False)
            with gr.Column():
                gr.Markdown("### Negative Rejection\n**Description:** This measures the model's ability to reject invalid or nonsensical queries instead of generating incorrect responses. A higher rejection rate means the model is better at filtering unreliable inputs.")
                rejection_table = gr.Dataframe(value=load_negative_rejection_scores(), interactive=False)

        with gr.Row():
            with gr.Column():
                gr.Markdown("""
### Counterfactual Robustness
**Description:**
Counterfactual Robustness evaluates a model's ability to handle **errors in external knowledge** while ensuring reliable responses.

**Key Metrics in this Report:**
- **Accuracy (%)** – Measures the accuracy (%) of LLMs with counterfactual documents.
- **Error Detection Rate (%)** – Measures how often the model **rejects** incorrect or misleading queries instead of responding.
- **Correction Rate (%)** – Measures how often the model provides accurate responses despite **potential misinformation**.
""")
                counter_factual_table = gr.Dataframe(value=load_counterfactual_robustness_scores(), interactive=False)
            with gr.Column():
                gr.Markdown("### Information Integration\n**Description:** The experimental results of information integration, measured by accuracy (%) under different noise ratios. The results show that information integration poses a challenge for RAG in LLMs.")
                integration_table = gr.Dataframe(value=load_scores(Information_Integration_DIR), interactive=False)

        def refresh_scores():
            return load_scores(Noise_Robustness_DIR), load_negative_rejection_scores(), load_counterfactual_robustness_scores(), load_scores(Information_Integration_DIR)

        refresh_btn.click(refresh_scores, outputs=[noise_table, rejection_table, counter_factual_table, integration_table])

        def recalculate_noise_robustness(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_noise_robustness(config)
            return load_scores(Noise_Robustness_DIR)

        recalculate_noise_btn.click(recalculate_noise_robustness, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[noise_table])

        def recalculate_counterfactual_robustness(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_factual_robustness(config)
            return load_counterfactual_robustness_scores()

        recalculate_counterfactual_btn.click(recalculate_counterfactual_robustness, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[counter_factual_table])

        def recalculate_negative_rejection(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_negative_rejection(config)
            return load_negative_rejection_scores()

        recalculate_negative_btn.click(recalculate_negative_rejection, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[rejection_table])

        def recalculate_integration_info(model_name, noise_rate, num_queries):
            update_config(config, model_name, noise_rate, num_queries)
            evaluate_information_integration(config)
            return load_scores(Information_Integration_DIR)

        recalculate_integration_btn.click(recalculate_integration_info, inputs=[model_name_input, noise_rate_input, num_queries_input], outputs=[integration_table])

    app.launch()
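
# Hypothetical entry point: launch_gradio_app expects a config dict with at
# least a "models" list for the dropdown above. How the real config is loaded
# (JSON vs. YAML, the "config.json" path, defaults) is an assumption here;
# adapt it to whatever scripts.helper.update_config actually works with.
if __name__ == "__main__":
    with open("config.json", "r") as f:
        config = json.load(f)
    launch_gradio_app(config)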