File size: 7,850 Bytes
eca534f
 
 
 
 
 
 
 
ec53a03
 
eca534f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48d16d8
eca534f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import os
import uuid
import zipfile
import pandas as pd
import streamlit as st
from importlib import resources as pkg_resources

from openfactcheck.app.utils import metric_card

from openfactcheck import OpenFactCheck
from openfactcheck.templates import llm as templates_dir

# Path (as a string) of the "questions.csv" resource bundled in the LLM templates package
questions_templates_path = str(pkg_resources.files(templates_dir).joinpath("questions.csv"))

# Datasets offered in the multiselect, in display order.
_AVAILABLE_DATASETS = [
    "snowballing",
    "selfaware",
    "freshqa",
    "factoolqa",
    "felm-wk",
    "factcheck-bench",
    "factscore-bio",
]

# Datasets whose results are shown together in a single "free-text" bar plot.
_FREE_TEXT_DATASETS = frozenset({"factoolqa", "felm-wk", "factcheck-bench", "factscore-bio"})

# (dataset, section heading, plot keys rendered left-to-right in a 3-column row)
_DATASET_PLOT_LAYOUT = (
    ("snowballing", "#### Evaluation on Snowballing Dataset", ("barplot", "cm")),
    ("selfaware", "#### Evaluation on SelfAware Dataset", ("barplot", "cm")),
    ("freshqa", "#### Evaluation on FreshQA Dataset", ("piechart",)),
)

# Session-state fields mirrored from the ``input_<field>`` widgets, mapped to
# the value they hold until the user edits the corresponding widget.
_USER_FIELD_DEFAULTS = {
    "first_name": "",
    "last_name": "",
    "email": "",
    "organization": "",
    "llm_model": "",
    "include_in_leaderboard": False,
}


def _sync_user_field(field):
    """Copy the widget value stored under ``input_<field>`` into ``st.session_state[field]``."""
    st.session_state[field] = st.session_state[f"input_{field}"]


def _render_questions_download():
    """Offer the bundled questions CSV as a ZIP download, or show an error if it is missing."""
    if not os.path.exists(questions_templates_path):
        st.error("File not found.")
        return

    # Local import, as in the original implementation of this page.
    from io import BytesIO

    # Build the ZIP archive entirely in memory.
    archive = BytesIO()
    with zipfile.ZipFile(archive, "w") as zf:
        # Store the file under its base name ('questions.csv') inside the archive.
        zf.write(questions_templates_path, arcname=os.path.basename(questions_templates_path))

    st.download_button(
        label="Download",
        data=archive.getvalue(),
        file_name="openfactcheck_llm_benchmark.zip",
        mime="application/zip",
    )


def _read_uploaded_csv(uploaded_file):
    """Parse the uploaded file as CSV.

    Returns the parsed ``pandas.DataFrame``, or ``None`` after showing an
    error message when the file is not a readable CSV.
    """
    # Validate by extension rather than by browser-reported MIME type: some
    # browsers/platforms report CSV files as e.g. "application/vnd.ms-excel",
    # which a strict "text/csv" comparison would wrongly reject.
    if not uploaded_file.name.lower().endswith(".csv"):
        st.error("Invalid file format. Please upload a CSV file.")
        return None

    try:
        return pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as err:
        st.error(f"Could not read the uploaded CSV file: {err}")
        return None


def _render_user_form():
    """Render the leaderboard information widgets and keep session state in sync."""
    # Seed the mirrored fields so they exist even if the user never edits a
    # widget (``on_change`` only fires on an actual change).
    for field, default in _USER_FIELD_DEFAULTS.items():
        if field not in st.session_state:
            st.session_state[field] = default

    text_fields = (
        ("First Name", "first_name"),
        ("Last Name", "last_name"),
        ("Email", "email"),
        ("LLM Model Name", "llm_model"),
        ("Organization (Optional)", "organization"),
    )
    for label, field in text_fields:
        st.text_input(label, key=f"input_{field}", on_change=_sync_user_field, args=(field,))

    st.checkbox(
        "Please check this box if you want your LLM to be included in the leaderboard.",
        key="input_include_in_leaderboard",
        on_change=_sync_user_field,
        args=("include_in_leaderboard",),
    )


def _render_plots(plots, datasets):
    """Display the generated plots for every selected dataset."""
    for dataset, heading, plot_keys in _DATASET_PLOT_LAYOUT:
        if dataset not in datasets:
            continue
        st.write(heading)
        # Always lay out three columns; unused trailing columns stay empty.
        columns = st.columns(3)
        for column, plot_key in zip(columns, plot_keys):
            with column:
                st.pyplot(plots[dataset][plot_key])

    # All free-text datasets share one combined bar plot.
    if any(dataset in _FREE_TEXT_DATASETS for dataset in datasets):
        st.write("#### Evaluation on Free-Text Datasets")
        st.pyplot(plots["freetext"]["barplot"])


def _render_report_download(report_path):
    """Offer the generated report as a PDF download, or show an error if it is missing."""
    # ``report_path`` may be falsy if report generation produced nothing;
    # guard before handing it to ``os.path.exists``.
    if not report_path or not os.path.exists(report_path):
        st.error("File not found.")
        return

    with open(report_path, "rb") as file:
        report_bytes = file.read()

    st.download_button(
        label="Download",
        data=report_bytes,
        file_name="llm_evaluation_report.pdf",
        mime="application/pdf",
    )


def evaluate_llm(ofc: OpenFactCheck) -> None:
    """
    Render the Streamlit page used to evaluate the factuality of a LLM.

    The page lets the user download the benchmark questions, upload the model
    responses as a CSV file, pick the datasets to evaluate on, enter leaderboard
    information and finally run the evaluation, after which the plots and a
    downloadable report are displayed.

    Args:
        ofc: OpenFactCheck instance; its ``LLMEvaluator`` attribute is used to
            run the evaluation, generate the plots and generate the report.

    Returns:
        None. The function returns early (after showing a message) when no file
        is uploaded, the upload cannot be parsed, the evaluate button has not
        been pressed, or no dataset is selected.
    """
    # Initialize the LLM Evaluator
    llm_evaluator = ofc.LLMEvaluator

    st.write("This is where you can evaluate the factuality of a LLM.")

    # Questions download
    st.write("Download the questions and instructions to evaluate the factuality of a LLM.")
    _render_questions_download()

    # Responses upload (the uploader only accepts CSV, so say so)
    st.write("Upload the model responses as a CSV file below to evaluate the factuality.")
    uploaded_file = st.file_uploader("Upload", type=["csv"], label_visibility="collapsed")
    if uploaded_file is None:
        st.info("Please upload a CSV file.")
        return

    uploaded_data = _read_uploaded_csv(uploaded_file)
    if uploaded_data is None:
        return

    # Ask user to select datasets they want to evaluate on
    st.write("Please select the datasets you want to evaluate the LLM on.")
    datasets = st.multiselect("Select datasets", _AVAILABLE_DATASETS)

    # Leaderboard information
    st.write("Please provide the following information to be included in the leaderboard.")
    st.session_state.id = llm_evaluator.run_id
    _render_user_form()

    if not st.button("Evaluate LLM"):
        return

    # Nothing to evaluate or plot without at least one dataset.
    if not datasets:
        st.warning("Please select at least one dataset to evaluate the LLM on.")
        return

    # Display a success message
    st.success("User information saved successfully.")

    # Display an information message
    st.info(f"""Please wait while we evaluate the factuality of the LLM.
You will be able to download the evaluation report shortly, if you can wait. The report will also be delivered to your email address.
                
Please note your ID {st.session_state.id}, This will be used to track your evaluation.
If the report is not available, please contact the administrator and provide your ID.""")

    # Display a waiting message while the evaluation runs
    with st.status("Evaluating factuality of the LLM...", expanded=True) as status:
        # Evaluate the LLM (the parsed DataFrame is passed as ``input_path``,
        # exactly as before)
        llm_evaluator.evaluate(
            model_name=st.session_state.llm_model,
            input_path=uploaded_data,
            datasets=datasets,
            save_report=False,
        )

        # Get plots
        st.write("Generating plots...")
        plots = llm_evaluator.generate_plots(save_plots=False)

        # Generate the evaluation report
        st.write("Generating evaluation report...")
        report_path = llm_evaluator.generate_report(
            report_path=f"{llm_evaluator.output_path}/{llm_evaluator.run_id}"
        )

        status.update(label="LLM evaluated...", state="complete", expanded=False)

    # Display the plots
    st.write("### Evaluation Report")
    _render_plots(plots, datasets)

    # Offer the evaluation report for download
    st.write("### Download Evaluation Report")
    st.info("The report will also be sent to your email address.")
    _render_report_download(report_path)