File size: 7,850 Bytes
eca534f ec53a03 eca534f 48d16d8 eca534f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
import os
import uuid
import zipfile
import pandas as pd
import streamlit as st
from importlib import resources as pkg_resources
from openfactcheck.app.utils import metric_card
from openfactcheck import OpenFactCheck
from openfactcheck.templates import llm as templates_dir
# Filesystem path (as a string) of the "questions.csv" resource that ships
# inside the openfactcheck.templates.llm package.
questions_templates_path = str(
    pkg_resources.files(templates_dir).joinpath("questions.csv")
)
def evaluate_llm(ofc: OpenFactCheck):
    """
    Render the Streamlit page used to evaluate the factuality of a LLM.

    The page offers the question template as a ZIP download, accepts the
    model responses as an uploaded CSV file, lets the user pick the datasets
    and enter their details, and -- once "Evaluate LLM" is pressed -- runs the
    evaluation, shows the generated plots and offers the report for download.

    Args:
        ofc: OpenFactCheck instance; only its ``LLMEvaluator`` attribute is
            used here.

    Returns:
        None. The function returns early when no file has been uploaded,
        when the upload cannot be read as CSV, or when the "Evaluate LLM"
        button was not pressed on this rerun.
    """
    from io import BytesIO

    # Initialize the LLM Evaluator
    llm_evaluator = ofc.LLMEvaluator

    st.write("This is where you can evaluate the factuality of a LLM.")

    # Display the instructions
    st.write("Download the questions and instructions to evaluate the factuality of a LLM.")

    # Offer the question template, zipped in memory, if it is available
    if os.path.exists(questions_templates_path):
        memory_file = BytesIO()
        with zipfile.ZipFile(memory_file, 'w') as zf:
            # Store the file under its base name ('questions.csv') in the archive
            zf.write(
                questions_templates_path,
                arcname=os.path.basename(questions_templates_path),
            )

        # Reset pointer to start of the memory file before handing it over
        memory_file.seek(0)

        st.download_button(
            label="Download",
            data=memory_file,
            file_name="openfactcheck_llm_benchmark.zip",
            mime="application/zip",
        )
    else:
        st.error("File not found.")

    # Display the instructions (the uploader below only accepts CSV)
    st.write("Upload the model responses as a CSV file below to evaluate the factuality.")

    # Upload the model output
    uploaded_file = st.file_uploader("Upload", type=["csv"], label_visibility="collapsed")

    # Nothing to do until a file has been uploaded
    if uploaded_file is None:
        st.info("Please upload a CSV file.")
        return

    # Validate by file extension rather than by the browser-reported MIME
    # type: some browsers/platforms label CSV files with a different type
    # (e.g. "application/vnd.ms-excel"), which would reject valid uploads.
    if not uploaded_file.name.lower().endswith(".csv"):
        st.error("Invalid file format. Please upload a CSV file.")
        return

    # Read the CSV file; report unreadable content instead of crashing
    try:
        uploaded_data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError):
        st.error("Invalid file format. Please upload a CSV file.")
        return

    # Ask user to select datasets they want to evaluate on
    st.write("Please select the datasets you want to evaluate the LLM on.")
    datasets = st.multiselect(
        "Select datasets",
        ["snowballing", "selfaware", "freshqa", "factoolqa", "felm-wk", "factcheck-bench", "factscore-bio"],
    )

    def _mirror(field):
        """Build an on_change callback copying widget ``input_<field>`` to ``<field>``."""
        def _callback():
            st.session_state[field] = st.session_state[f"input_{field}"]
        return _callback

    # Display instructions
    st.write("Please provide the following information to be included in the leaderboard.")

    # Create text inputs to enter the user information
    st.session_state.id = llm_evaluator.run_id
    st.text_input("First Name", key="input_first_name", on_change=_mirror("first_name"))
    st.text_input("Last Name", key="input_last_name", on_change=_mirror("last_name"))
    st.text_input("Email", key="input_email", on_change=_mirror("email"))
    st.text_input("LLM Model Name", key="input_llm_model", on_change=_mirror("llm_model"))
    st.text_input("Organization (Optional)", key="input_organization", on_change=_mirror("organization"))

    # Create a checkbox to include the user in the leaderboard
    st.checkbox(
        "Please check this box if you want your LLM to be included in the leaderboard.",
        key="input_include_in_leaderboard",
        on_change=_mirror("include_in_leaderboard"),
    )

    # Everything below only runs on the rerun triggered by the button
    if not st.button("Evaluate LLM"):
        return

    # Read the model name from the widget's own key: the mirrored
    # ``llm_model`` entry only exists once its on_change callback has fired,
    # so it is missing when the user never edited the field.
    model_name = st.session_state.get("input_llm_model", "")

    # Display a success message
    st.success("User information saved successfully.")

    # Display an information message
    st.info(
        "Please wait while we evaluate the factuality of the LLM.\n"
        "You will be able to download the evaluation report shortly, if you can wait. "
        "The report will also be delivered to your email address.\n"
        f"Please note your ID {st.session_state.id}, This will be used to track your evaluation.\n"
        "If the report is not available, please contact the administrator and provide your ID."
    )

    # Display a waiting message while the evaluation runs
    with st.status("Evaluating factuality of the LLM...", expanded=True) as status:
        # Evaluate the LLM (the parsed DataFrame is passed as ``input_path``)
        llm_evaluator.evaluate(
            model_name=model_name,
            input_path=uploaded_data,
            datasets=datasets,
            save_report=False,
        )

        # Get plots
        st.write("Generating plots...")
        plots = llm_evaluator.generate_plots(save_plots=False)

        # Generate the evaluation report
        st.write("Generating evaluation report...")
        report_path = llm_evaluator.generate_report(
            report_path=f"{llm_evaluator.output_path}/{llm_evaluator.run_id}"
        )

        status.update(label="LLM evaluated...", state="complete", expanded=False)

    # Display the plots
    st.write("### Evaluation Report")

    # Each section uses a three-column layout; unused columns stay empty.
    if "snowballing" in datasets:
        st.write("#### Evaluation on Snowballing Dataset")
        col1, col2, _ = st.columns(3)
        with col1:
            st.pyplot(plots["snowballing"]["barplot"])
        with col2:
            st.pyplot(plots["snowballing"]["cm"])

    if "selfaware" in datasets:
        st.write("#### Evaluation on SelfAware Dataset")
        col1, col2, _ = st.columns(3)
        with col1:
            st.pyplot(plots["selfaware"]["barplot"])
        with col2:
            st.pyplot(plots["selfaware"]["cm"])

    if "freshqa" in datasets:
        st.write("#### Evaluation on FreshQA Dataset")
        col1, _, _ = st.columns(3)
        with col1:
            st.pyplot(plots["freshqa"]["piechart"])

    # If any of the free-text datasets are selected
    free_text_datasets = {"factoolqa", "felm-wk", "factcheck-bench", "factscore-bio"}
    if any(dataset in free_text_datasets for dataset in datasets):
        st.write("#### Evaluation on Free-Text Datasets")
        st.pyplot(plots["freetext"]["barplot"])

    # Offer the evaluation report for download
    st.write("### Download Evaluation Report")
    st.info("The report will also be sent to your email address.")

    # Guard against a missing path as well as a missing file
    if report_path and os.path.exists(report_path):
        with open(report_path, "rb") as file:
            report_bytes = file.read()

        st.download_button(
            label="Download",
            data=report_bytes,
            file_name="llm_evaluation_report.pdf",
            mime="application/pdf",
        )
    else:
        st.error("File not found.")
|