Hasan Iqbal committed on
Commit
aea72ae
·
unverified ·
1 Parent(s): 27f728f

LLM Response Evaluation Finalized

Browse files
src/openfactcheck/app/evaluate_response.py CHANGED
@@ -6,6 +6,16 @@ import streamlit as st
6
  from openfactcheck.core.base import OpenFactCheck
7
  from openfactcheck.app.utils import metric_card
8
 
 
 
 
 
 
 
 
 
 
 
9
  # Create a function to check an LLM response
10
  def evaluate_response(ofc: OpenFactCheck):
11
  """
@@ -77,22 +87,13 @@ def evaluate_response(ofc: OpenFactCheck):
77
  # Get the number of detected claims
78
  detected_claims = output_text.get("claims", [])
79
 
80
- def extract_text(claim):
81
- """
82
- Extracts text from a claim that might be a string formatted as a dictionary.
83
- """
84
- # Try to extract text using regular expression if claim is a string formatted as a dictionary
85
- match = re.search(r"'text': '([^']+)'", claim)
86
- if match:
87
- return match.group(1)
88
- return claim # Return as is if no dictionary format detected
89
-
90
  # Generate formatted text with enumerated claims in Markdown format
91
- formatted_text = "#### Detected Claims\n" + "\n".join(f"{i}. {extract_text(claim)}" for i, claim in enumerate(detected_claims, start=1)) + "\n"
 
 
92
 
93
  with col2:
94
- with st.container():
95
- metric_card(label="Detected Claims", value=len(detected_claims))
96
 
97
  # Yield each word with a space and simulate typing by sleeping
98
  for word in formatted_text.split(" "):
@@ -110,24 +111,110 @@ def evaluate_response(ofc: OpenFactCheck):
110
  for evidence in claim_with_evidences:
111
  evidences.append(evidence[1])
112
 
113
- # Generate formatted text with enumerated evidences in Markdown format
114
- formatted_text = "#### Retrieved Evidences\n" + "\n".join(f"{i}. {evidence}" for i, evidence in enumerate(evidences, start=1))
 
 
115
 
116
  with col2:
117
- with st.container():
118
- metric_card(label="Retrieved Evidences", value=len(evidences))
119
 
120
- # Yield each word with a space and simulate typing by sleeping
121
- for word in formatted_text.split(" "):
122
- yield word + " "
123
- time.sleep(0.01)
124
 
125
  elif "verifier" in response["solver_name"]:
126
  # Extract response details
127
  output_text = response["output"]
128
 
129
- # Store the final response in the session state
130
- st.session_state.final_response = output_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  # Yield each word with a space and simulate typing by sleeping
133
  for word in formatted_text.split(" "):
@@ -135,14 +222,4 @@ def evaluate_response(ofc: OpenFactCheck):
135
  time.sleep(0.01)
136
 
137
  st.write_stream(process_stream(response))
138
-
139
- # Process the final response
140
- final_response = st.session_state.final_response
141
- if final_response is not None:
142
- overall_factuality = final_response.get("label", "Unknown")
143
- with col2:
144
- with st.container():
145
- if overall_factuality == True:
146
- metric_card(label="Overall Factuality", value="True", background_color="#D4EDDA", border_left_color="#28A745")
147
- elif overall_factuality == False:
148
- metric_card(label="Overall Factuality", value="False", background_color="#F8D7DA", border_left_color="#DC3545")
 
6
  from openfactcheck.core.base import OpenFactCheck
7
  from openfactcheck.app.utils import metric_card
8
 
9
def extract_text(claim):
    """
    Extract the human-readable text from a claim.

    A claim may be a plain sentence, a real dictionary with a ``"text"``
    entry, or a string that is the ``repr`` of such a dictionary
    (for example ``"{'text': 'Paris is in France'}"``).

    Args:
        claim: The claim as a string or a dictionary.

    Returns:
        The value of the ``"text"`` entry when one can be found; otherwise
        the claim itself, unchanged.
    """
    # Function-scope import so this block does not depend on the module's import list.
    import ast

    # A real mapping: read the entry directly instead of pattern-matching on it.
    if isinstance(claim, dict):
        return claim.get("text", claim)

    # Anything else that is not a string cannot be searched; hand it back as is.
    if not isinstance(claim, str):
        return claim

    # A string that is a complete dict literal: parse it properly so values that
    # repr() wrote with double quotes or escaped quotes (texts containing
    # apostrophes) are recovered in full. literal_eval only evaluates literals,
    # it never executes code.
    if claim.lstrip().startswith("{"):
        try:
            parsed = ast.literal_eval(claim.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None
        if isinstance(parsed, dict) and isinstance(parsed.get("text"), str):
            return parsed["text"]

    # Fallback: the dict-like fragment is embedded in other text or is malformed.
    match = re.search(r"'text': '([^']+)'", claim)
    if match:
        return match.group(1)

    # No dictionary format detected.
    return claim
18
+
19
  # Create a function to check an LLM response
20
  def evaluate_response(ofc: OpenFactCheck):
21
  """
 
87
  # Get the number of detected claims
88
  detected_claims = output_text.get("claims", [])
89
 
 
 
 
 
 
 
 
 
 
 
90
  # Generate formatted text with enumerated claims in Markdown format
91
+ formatted_text = "### Detected Claims\n"
92
+ formatted_text += "\n".join(f"{i}. {extract_text(claim)}" for i, claim in enumerate(detected_claims, start=1))
93
+ formatted_text += "\n"
94
 
95
  with col2:
96
+ metric_card(label="Detected Claims", value=len(detected_claims))
 
97
 
98
  # Yield each word with a space and simulate typing by sleeping
99
  for word in formatted_text.split(" "):
 
111
  for evidence in claim_with_evidences:
112
  evidences.append(evidence[1])
113
 
114
+ # # Generate formatted text with enumerated evidences in Markdown format
115
+ # formatted_text = "#### Retrieved Evidences\n"
116
+ # formatted_text += "\n".join(f"{i}. {evidence}" for i, evidence in enumerate(evidences, start=1))
117
+ # formatted_text += "\n"
118
 
119
  with col2:
120
+ metric_card(label="Retrieved Evidences", value=len(evidences))
 
121
 
122
+ # # Yield each word with a space and simulate typing by sleeping
123
+ # for word in formatted_text.split(" "):
124
+ # yield word + " "
125
+ # time.sleep(0.01)
126
 
127
  elif "verifier" in response["solver_name"]:
128
  # Extract response details
129
  output_text = response["output"]
130
 
131
+ # Get detail
132
+ details = output_text.get("detail", None)
133
+ if details is None:
134
+ detail_text = "The verifier did not provide any detail. Please use other verifiers for more information."
135
+ else:
136
+ detail_text = ""
137
+
138
+ # Apply color to the claim based on factuality
139
+ claims=0
140
+ false_claims = 0
141
+ true_claims = 0
142
+ for i, detail in enumerate(details):
143
+ if detail.get("factuality", None) is not None:
144
+ claim=detail.get("claim", "")
145
+ if detail.get("factuality", None) == -1:
146
+ detail_text += f'##### :red[{str(i+1) + ". " + extract_text(claim)}]'
147
+ detail_text += "\n"
148
+ claims += 1
149
+ false_claims += 1
150
+ elif detail.get("factuality", None) == 1:
151
+ detail_text += f'##### :green[{str(i+1) + ". " + extract_text(claim)}]'
152
+ detail_text += "\n"
153
+ claims += 1
154
+ true_claims += 1
155
+ else:
156
+ detail_text += f'##### :yellow[{str(i+1) + ". " + extract_text(claim)}]'
157
+ detail_text += "\n"
158
+ claims += 1
159
+ else:
160
+ st.error("Factuality not found in the verifier output.")
161
+
162
+ # Add error information
163
+ if detail.get("error", None) is not "None":
164
+ detail_text += f"- **Error**: {detail.get('error', '')}"
165
+ detail_text += "\n"
166
+
167
+ # Add reasoning information
168
+ if detail.get("reasoning", None) is not "None":
169
+ detail_text += f"- **Reasoning**: {detail.get('reasoning', '')}"
170
+ detail_text += "\n"
171
+
172
+ # Add correction
173
+ if detail.get("correction", None) is not "":
174
+ detail_text += f"- **Correction**: {detail.get('correction', '')}"
175
+ detail_text += "\n"
176
+
177
+ # Add evidence
178
+ if detail.get("evidence", None) is not "":
179
+ evidence_text = ""
180
+ for evidence in detail.get("evidences", []):
181
+ evidence_text += f" - {evidence[1]}"
182
+ evidence_text += "\n"
183
+ detail_text += f"- **Evidence**:\n{evidence_text}"
184
+
185
+
186
+ # Generate formatted text with the overall factuality in Markdown format
187
+ formatted_text = "### Factuality Detail\n"
188
+ formatted_text += "Factuality of each claim is color-coded (red:[red means false], green:[green means true], yellow:[yellow means unknown]) as follows:\n"
189
+ formatted_text += f"{detail_text}\n"
190
+ formatted_text += "\n"
191
+
192
+ # Get the number of true and false claims
193
+ with col2:
194
+ metric_card(label="Supported Claims", value=true_claims, background_color="#D1ECF1", border_left_color="#17A2B8")
195
+ metric_card(label="Conflicted Claims", value=false_claims, background_color="#D1ECF1", border_left_color="#17A2B8")
196
+
197
+ # Get overall factuality (label)
198
+ overall_factuality = output_text.get("label", "Unknown")
199
+ with col2:
200
+ with st.container():
201
+ if overall_factuality == True:
202
+ metric_card(label="Overall Factuality", value="True", background_color="#D4EDDA", border_left_color="#28A745")
203
+ elif overall_factuality == False:
204
+ metric_card(label="Overall Factuality", value="False", background_color="#F8D7DA", border_left_color="#DC3545")
205
+
206
+ # Get overall credibility (score)
207
+ overall_credibility = true_claims / claims if claims > 0 else 0
208
+ with col2:
209
+ if overall_credibility > 0.75 and overall_credibility <= 1:
210
+ # Green background
211
+ metric_card(label="Overall Credibility", value=f"{overall_credibility:.2%}", background_color="#D4EDDA", border_left_color="#28A745")
212
+ elif overall_credibility > 0.25 and overall_credibility <= 0.75:
213
+ # Yellow background
214
+ metric_card(label="Overall Credibility", value=f"{overall_credibility:.2%}", background_color="#FFF3CD", border_left_color="#FFC107")
215
+ else:
216
+ # Red background
217
+ metric_card(label="Overall Credibility", value=f"{overall_credibility:.2%}", background_color="#F8D7DA", border_left_color="#DC3545")
218
 
219
  # Yield each word with a space and simulate typing by sleeping
220
  for word in formatted_text.split(" "):
 
222
  time.sleep(0.01)
223
 
224
  st.write_stream(process_stream(response))
225
+