Hasan Iqbal committed on
Commit e75b352 · unverified · 1 Parent(s): 5574116

Added Questions for the evidences

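The change threads the retrieval question through to the UI: `claims_with_evidences` now yields `(question, evidence)` pairs, the retriever branch collects both, and the verifier branch groups each claim's evidences under the question that retrieved them. A minimal, self-contained sketch of that grouping, using a hypothetical payload (the dictionary contents below are invented; the key name, the tuple shape, and the `questions_evidences` grouping mirror the diff that follows):

```python
# Hypothetical retriever-style output: each claim maps to (question, evidence) pairs.
claims_with_evidences = {
    "claim_0": [
        ("Who wrote Hamlet?", "Hamlet was written by William Shakespeare."),
        ("Who wrote Hamlet?", "The play was first performed around 1600."),
    ],
}

# Retriever branch: keep the questions alongside the evidences (new in this commit).
questions, evidences = [], []
for _, pairs in claims_with_evidences.items():
    for question, evidence in pairs:
        questions.append(question)
        evidences.append(evidence)

# Verifier branch: group evidences by the question that produced them before rendering.
questions_evidences = {}
for question, evidence in zip(questions, evidences):
    questions_evidences.setdefault(question, []).append(evidence)

detail_text = ""
for question, evs in questions_evidences.items():
    detail_text += f"- **Evidences against Question**: :orange[{question}]\n"
    for evidence in evs:
        detail_text += f"    - {evidence}\n"
print(detail_text)
```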
src/openfactcheck/app/evaluate_response.py CHANGED
@@ -7,6 +7,7 @@ import streamlit as st
 from openfactcheck.base import OpenFactCheck
 from openfactcheck.app.utils import metric_card
 
+
 def extract_text(claim):
     """
     Extracts text from a claim that might be a string formatted as a dictionary.
@@ -17,6 +18,7 @@ def extract_text(claim):
         return match.group(1)
     return claim # Return as is if no dictionary format detected
 
+
 # Create a function to check a LLM response
 def evaluate_response(ofc: OpenFactCheck):
     """
@@ -40,32 +42,52 @@ def evaluate_response(ofc: OpenFactCheck):
     col1, col2, col3 = st.columns(3)
     with col1:
         if "claimprocessor" not in st.session_state:
-            st.session_state.claimprocessor = st.selectbox("Select Claim Processor", list(st.session_state.claimprocessors))
+            st.session_state.claimprocessor = st.selectbox(
+                "Select Claim Processor", list(st.session_state.claimprocessors)
+            )
         else:
-            st.session_state.claimprocessor = st.selectbox("Select Claim Processor", list(st.session_state.claimprocessors), index=list(st.session_state.claimprocessors).index(st.session_state.claimprocessor))
+            st.session_state.claimprocessor = st.selectbox(
+                "Select Claim Processor",
+                list(st.session_state.claimprocessors),
+                index=list(st.session_state.claimprocessors).index(st.session_state.claimprocessor),
+            )
     with col2:
         if "retriever" not in st.session_state:
             st.session_state.retriever = st.selectbox("Select Retriever", list(st.session_state.retrievers))
         else:
-            st.session_state.retriever = st.selectbox("Select Retriever", list(st.session_state.retrievers), index=list(st.session_state.retrievers).index(st.session_state.retriever))
+            st.session_state.retriever = st.selectbox(
+                "Select Retriever",
+                list(st.session_state.retrievers),
+                index=list(st.session_state.retrievers).index(st.session_state.retriever),
+            )
     with col3:
         if "verifier" not in st.session_state:
             st.session_state.verifier = st.selectbox("Select Verifier", list(st.session_state.verifiers))
         else:
-            st.session_state.verifier = st.selectbox("Select Verifier", list(st.session_state.verifiers), index=list(st.session_state.verifiers).index(st.session_state.verifier))
+            st.session_state.verifier = st.selectbox(
+                "Select Verifier",
+                list(st.session_state.verifiers),
+                index=list(st.session_state.verifiers).index(st.session_state.verifier),
+            )
 
     # Input
     if "input_text" not in st.session_state:
-        st.session_state.input_text = {"text": st.text_area("Enter LLM response here", "This is a sample LLM response.")}
+        st.session_state.input_text = {
+            "text": st.text_area("Enter LLM response here", "This is a sample LLM response.")
+        }
    else:
-        st.session_state.input_text = {"text": st.text_area("Enter LLM response here", st.session_state.input_text["text"])}
+        st.session_state.input_text = {
+            "text": st.text_area("Enter LLM response here", st.session_state.input_text["text"])
+        }
 
     # Button to check factuality
     if st.button("Check Factuality"):
         with st.status("Checking factuality...", expanded=True) as status:
             # Configure the pipeline
             st.write("Configuring pipeline...")
-            ofc.init_pipeline_manually([st.session_state.claimprocessor, st.session_state.retriever, st.session_state.verifier])
+            ofc.init_pipeline_manually(
+                [st.session_state.claimprocessor, st.session_state.retriever, st.session_state.verifier]
+            )
             st.write("Pipeline configured...")
 
             # Evaluate the response
@@ -77,7 +99,9 @@ def evaluate_response(ofc: OpenFactCheck):
             status.update(label="Factuality checked...", state="complete", expanded=False)
 
         # Display pipeline configuration
-        pipeline_str = "&nbsp;&nbsp;&nbsp;┈➤&nbsp;&nbsp;&nbsp;".join([st.session_state.claimprocessor, st.session_state.retriever, st.session_state.verifier])
+        pipeline_str = "&nbsp;&nbsp;&nbsp;┈➤&nbsp;&nbsp;&nbsp;".join(
+            [st.session_state.claimprocessor, st.session_state.retriever, st.session_state.verifier]
+        )
         st.info(f"""**Pipeline**:&nbsp;&nbsp;&nbsp; \n{pipeline_str}""")
 
         # Store the final response in the session state
@@ -85,6 +109,7 @@ def evaluate_response(ofc: OpenFactCheck):
 
         col1, col2 = st.columns([3, 1])
         with col1:
+
             def process_stream(responses):
                 """
                 Process each response from the stream as a simulated chat output.
@@ -102,7 +127,9 @@ def evaluate_response(ofc: OpenFactCheck):
 
                         # Generate formatted text with enumerated claims in Markdown format
                         formatted_text = "### Detected Claims\n"
-                        formatted_text += "\n".join(f"{i}. {extract_text(claim)}" for i, claim in enumerate(detected_claims, start=1))
+                        formatted_text += "\n".join(
+                            f"{i}. {extract_text(claim)}" for i, claim in enumerate(detected_claims, start=1)
+                        )
                         formatted_text += "\n"
 
                         with col2:
@@ -119,24 +146,16 @@ def evaluate_response(ofc: OpenFactCheck):
                         # Extract response details
                         output_text = response["output"]
 
+                        questions = []
                         evidences = []
                         for _, claim_with_evidences in output_text.get("claims_with_evidences", {}).items():
-                            for evidence in claim_with_evidences:
-                                evidences.append(evidence[1])
-
-                        # # Generate formatted text with enumerated evidences in Markdown format
-                        # formatted_text = "#### Retrieved Evidences\n"
-                        # formatted_text += "\n".join(f"{i}. {evidence}" for i, evidence in enumerate(evidences, start=1))
-                        # formatted_text += "\n"
+                            for claim_with_evidence in claim_with_evidences:
+                                questions.append(claim_with_evidence[0])
+                                evidences.append(claim_with_evidence[1])
 
                         with col2:
                             metric_card(label="Retrieved Evidences", value=len(evidences))
 
-                        # # Yield each word with a space and simulate typing by sleeping
-                        # for word in formatted_text.split(" "):
-                        #     yield word + " "
-                        #     time.sleep(0.01)
-
                     elif "verifier" in response["solver_name"]:
                         # Extract response details
                         output_text = response["output"]
@@ -149,7 +168,7 @@ def evaluate_response(ofc: OpenFactCheck):
                         detail_text = ""
 
                         # Apply color to the claim based on factuality
-                        claims=0
+                        claims = 0
                         false_claims = 0
                         true_claims = 0
                         controversial_claims = 0
@@ -158,7 +177,7 @@ def evaluate_response(ofc: OpenFactCheck):
                             # Get factuality information
                             factuality = str(detail.get("factuality", None))
                             if factuality is not None:
-                                claim=detail.get("claim", "")
+                                claim = detail.get("claim", "")
                                 if factuality == "-1" or factuality == "False":
                                     detail_text += f'##### :red[{str(i+1) + ". " + extract_text(claim)}]'
                                     detail_text += "\n"
@@ -183,29 +202,35 @@ def evaluate_response(ofc: OpenFactCheck):
                                     st.error("Factuality not found in the verifier output.")
 
                                 # Add error information
-                                if detail.get("error", None) is not "None":
+                                if detail.get("error", None) != "None":
                                     detail_text += f"- **Error**: {detail.get('error', '')}"
                                     detail_text += "\n"
 
                                 # Add reasoning information
-                                if detail.get("reasoning", None) is not "None":
+                                if detail.get("reasoning", None) != "None":
                                     detail_text += f"- **Reasoning**: {detail.get('reasoning', '')}"
                                     detail_text += "\n"
-
+
                                 # Add correction
-                                if detail.get("correction", None) is not "":
+                                if detail.get("correction", None) != "":
                                     detail_text += f"- **Correction**: {detail.get('correction', '')}"
                                     detail_text += "\n"
 
                                 # Add evidence
-                                if detail.get("evidence", None) is not "":
+                                if detail.get("evidences", None) != "":
                                     evidence_text = ""
+                                    questions_evidences = {}
                                     for evidence in detail.get("evidences", []):
-                                        evidence_text += f" - {evidence[1]}"
+                                        if evidence[0] not in questions_evidences:
+                                            questions_evidences[evidence[0]] = []
+                                        questions_evidences[evidence[0]].append(evidence[1])
+                                    for question, evidences in questions_evidences.items():
+                                        evidence_text += f"- **Evidences against Question**: :orange[{question}]"
                                         evidence_text += "\n"
-                                    detail_text += f"- **Evidence**:\n{evidence_text}"
+                                        for evidence in evidences:
+                                            evidence_text += f" - {evidence}\n"
+                                    detail_text += evidence_text
 
-
                         # Generate formatted text with the overall factuality in Markdown format
                         formatted_text = "### Factuality Detail\n"
                         formatted_text += "Factuality of each claim is color-coded (:red[red means false], :green[green means true], :orange[orange means controversial], :violet[violet means unverified]).\n"
@@ -214,32 +239,77 @@ def evaluate_response(ofc: OpenFactCheck):
 
                         # Get the number of true and false claims
                         with col2:
-                            metric_card(label="Supported Claims", value=true_claims, background_color="#D1ECF1", border_left_color="#17A2B8")
-                            metric_card(label="Conflicted Claims", value=false_claims, background_color="#D1ECF1", border_left_color="#17A2B8")
-                            metric_card(label="Controversial Claims", value=controversial_claims, background_color="#D1ECF1", border_left_color="#17A2B8")
-                            metric_card(label="Unverified Claims", value=unverified_claims, background_color="#D1ECF1", border_left_color="#17A2B8")
-
+                            metric_card(
+                                label="Supported Claims",
+                                value=true_claims,
+                                background_color="#D1ECF1",
+                                border_left_color="#17A2B8",
+                            )
+                            metric_card(
+                                label="Conflicted Claims",
+                                value=false_claims,
+                                background_color="#D1ECF1",
+                                border_left_color="#17A2B8",
+                            )
+                            metric_card(
+                                label="Controversial Claims",
+                                value=controversial_claims,
+                                background_color="#D1ECF1",
+                                border_left_color="#17A2B8",
+                            )
+                            metric_card(
+                                label="Unverified Claims",
+                                value=unverified_claims,
+                                background_color="#D1ECF1",
+                                border_left_color="#17A2B8",
+                            )
+
                         # Get overall factuality (label)
                         overall_factuality = output_text.get("label", "Unknown")
                         with col2:
                             with st.container():
-                                if overall_factuality == True:
-                                    metric_card(label="Overall Factuality", value="True", background_color="#D4EDDA", border_left_color="#28A745")
-                                elif overall_factuality == False:
-                                    metric_card(label="Overall Factuality", value="False", background_color="#F8D7DA", border_left_color="#DC3545")
+                                if overall_factuality:
+                                    metric_card(
+                                        label="Overall Factuality",
+                                        value="True",
+                                        background_color="#D4EDDA",
+                                        border_left_color="#28A745",
+                                    )
+                                elif not overall_factuality:
+                                    metric_card(
+                                        label="Overall Factuality",
+                                        value="False",
+                                        background_color="#F8D7DA",
+                                        border_left_color="#DC3545",
+                                    )
 
                         # Get overall credibility (score)
                         overall_credibility = true_claims / claims if claims > 0 else 0
                         with col2:
                             if overall_credibility > 0.75 and overall_credibility <= 1:
                                 # Green background
-                                metric_card(label="Overall Credibility", value=f"{overall_credibility:.2%}", background_color="#D4EDDA", border_left_color="#28A745")
+                                metric_card(
+                                    label="Overall Credibility",
+                                    value=f"{overall_credibility:.2%}",
+                                    background_color="#D4EDDA",
+                                    border_left_color="#28A745",
+                                )
                             elif overall_credibility > 0.25 and overall_credibility <= 0.75:
                                 # Yellow background
-                                metric_card(label="Overall Credibility", value=f"{overall_credibility:.2%}", background_color="#FFF3CD", border_left_color="#FFC107")
+                                metric_card(
+                                    label="Overall Credibility",
+                                    value=f"{overall_credibility:.2%}",
+                                    background_color="#FFF3CD",
+                                    border_left_color="#FFC107",
+                                )
                             else:
                                 # Red background
-                                metric_card(label="Overall Credibility", value=f"{overall_credibility:.2%}", background_color="#F8D7DA", border_left_color="#DC3545")
+                                metric_card(
+                                    label="Overall Credibility",
+                                    value=f"{overall_credibility:.2%}",
+                                    background_color="#F8D7DA",
+                                    border_left_color="#DC3545",
+                                )
 
                         # Yield each word with a space and simulate typing by sleeping
                         for word in formatted_text.split(" "):
@@ -247,4 +317,3 @@ def evaluate_response(ofc: OpenFactCheck):
                             time.sleep(0.01)
 
             st.write_stream(process_stream(response))
-
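Besides the question handling, the commit also replaces the `is not "None"` / `is not ""` checks with `!=`. The old form compares object identity against a string literal, which CPython flags with a `SyntaxWarning` (since Python 3.8) and which does not test the value at all; the new form compares values. A tiny illustration (the sample value is hypothetical):

```python
error = "".join(["No", "ne"])  # equal to "None", but a distinct string object
print(error is not "None")     # True: identity differs even though the text matches (SyntaxWarning)
print(error != "None")         # False: value comparison, as intended
```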