import os
import uuid
import zipfile
from io import BytesIO

import pandas as pd
import streamlit as st
from importlib import resources as pkg_resources

from openfactcheck import OpenFactCheck
from openfactcheck.app.utils import metric_card
from openfactcheck.templates import llm as templates_dir

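# Path to the benchmark questions CSV bundled with the package templates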
questions_templates_path = str(pkg_resources.files(templates_dir) / "questions.csv")


def evaluate_llm(ofc: OpenFactCheck):
    """
    This function creates a Streamlit app to evaluate the factuality of an LLM.
    """
    llm_evaluator = ofc.LLMEvaluator

    st.write("This is where you can evaluate the factuality of an LLM.")

    st.write("Download the questions and instructions to evaluate the factuality of an LLM.")

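    # Bundle the questions template into an in-memory ZIP so it can be served for download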
    if os.path.exists(questions_templates_path):
        memory_file = BytesIO()
        with zipfile.ZipFile(memory_file, "w") as zf:
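            # Archive the file under its base name rather than its full package path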
            zip_path = os.path.basename(questions_templates_path)
            zf.write(questions_templates_path, arcname=zip_path)

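        # Rewind the buffer so the download serves the ZIP from the start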
        memory_file.seek(0)

        st.download_button(
            label="Download",
            data=memory_file,
            file_name="openfactcheck_llm_benchmark.zip",
            mime="application/zip",
        )
    else:
        st.error("File not found.")

    st.write("Upload the model responses as a CSV file below to evaluate the factuality.")

    uploaded_file = st.file_uploader("Upload", type=["csv"], label_visibility="collapsed")

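    # Stop rendering the rest of the page until a file has been uploaded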
    if uploaded_file is None:
        st.info("Please upload a CSV file.")
        return

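    # Reject non-CSV uploads (browsers typically report CSV files as "text/csv")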
    if uploaded_file.type != "text/csv":
        st.error("Invalid file format. Please upload a CSV file.")
        return

    uploaded_data = pd.read_csv(uploaded_file)

    st.write("Please select the datasets you want to evaluate the LLM on.")
    datasets = st.multiselect(
        "Select datasets",
        ["snowballing", "selfaware", "freshqa", "factoolqa", "felm-wk", "factcheck-bench", "factscore-bio"],
    )

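    # Callbacks that persist widget inputs into the session state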
    def update_first_name():
        st.session_state.first_name = st.session_state.input_first_name

    def update_last_name():
        st.session_state.last_name = st.session_state.input_last_name

    def update_email():
        st.session_state.email = st.session_state.input_email

    def update_organization():
        st.session_state.organization = st.session_state.input_organization

    def update_llm_model():
        st.session_state.llm_model = st.session_state.input_llm_model

    def update_include_in_leaderboard():
        st.session_state.include_in_leaderboard = st.session_state.input_include_in_leaderboard

    st.write("Please provide the following information to be included in the leaderboard.")

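    # Use the evaluator's run ID to identify this submission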
    st.session_state.id = llm_evaluator.run_id
    st.text_input("First Name", key="input_first_name", on_change=update_first_name)
    st.text_input("Last Name", key="input_last_name", on_change=update_last_name)
    st.text_input("Email", key="input_email", on_change=update_email)
    st.text_input("LLM Model Name", key="input_llm_model", on_change=update_llm_model)
    st.text_input("Organization (Optional)", key="input_organization", on_change=update_organization)

    st.checkbox(
        "Please check this box if you want your LLM to be included in the leaderboard.",
        key="input_include_in_leaderboard",
        on_change=update_include_in_leaderboard,
    )

    if st.button("Evaluate LLM"):
        st.success("User information saved successfully.")

        st.info(f"""Please wait while we evaluate the factuality of the LLM.
You will be able to download the evaluation report shortly; it will also be delivered to your email address.

Please note your ID: {st.session_state.id}. It will be used to track your evaluation.
If the report is not available, please contact the administrator and provide your ID.""")

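        # Run the evaluation, then build plots and a PDF report inside a status widget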
        with st.status("Evaluating factuality of the LLM...", expanded=True) as status:
            results = llm_evaluator.evaluate(
                model_name=st.session_state.llm_model,
                input_path=uploaded_data,
                datasets=datasets,
                save_report=False,
            )

            st.write("Generating plots...")
            plots = llm_evaluator.generate_plots(save_plots=False)

            st.write("Generating evaluation report...")
            report_path = llm_evaluator.generate_report(
                report_path=f"{llm_evaluator.output_path}/{llm_evaluator.run_id}"
            )

            status.update(label="LLM evaluated...", state="complete", expanded=False)

        st.write("### Evaluation Report")

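        # Accuracy bar plot and confusion matrix for the Snowballing dataset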
        if "snowballing" in datasets:
            st.write("#### Evaluation on Snowballing Dataset")
            col1, col2, col3 = st.columns(3)
            with col1:
                st.pyplot(plots["snowballing"]["barplot"])
            with col2:
                st.pyplot(plots["snowballing"]["cm"])
            with col3:
                pass

        if "selfaware" in datasets:
            st.write("#### Evaluation on SelfAware Dataset")
            col1, col2, col3 = st.columns(3)
            with col1:
                st.pyplot(plots["selfaware"]["barplot"])
            with col2:
                st.pyplot(plots["selfaware"]["cm"])
            with col3:
                pass

        if "freshqa" in datasets:
            st.write("#### Evaluation on FreshQA Dataset")
            col1, col2, col3 = st.columns(3)
            with col1:
                st.pyplot(plots["freshqa"]["piechart"])
            with col2:
                pass
            with col3:
                pass

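        # Free-text datasets share a single combined bar plot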
        if any(dataset in ["factoolqa", "felm-wk", "factcheck-bench", "factscore-bio"] for dataset in datasets):
            st.write("#### Evaluation on Free-Text Datasets")
            st.pyplot(plots["freetext"]["barplot"])

        st.write("### Download Evaluation Report")
        st.info("The report will also be sent to your email address.")

        if os.path.exists(report_path):
            with open(report_path, "rb") as file:
                report_bytes = file.read()

            st.download_button(
                label="Download",
                data=report_bytes,
                file_name="llm_evaluation_report.pdf",
                mime="application/pdf",
            )
        else:
            st.error("File not found.")
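

# Usage sketch (assumption: OpenFactCheckConfig is the library's config entry point,
# as in the project README; adjust to your setup):
#
#   from openfactcheck import OpenFactCheck
#   from openfactcheck.lib import OpenFactCheckConfig
#
#   ofc = OpenFactCheck(OpenFactCheckConfig())
#   evaluate_llm(ofc)
#
# Then launch the page with: streamlit run <your_app>.py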