Files changed (1)
  1. metrics_v2.py +63 -1
metrics_v2.py CHANGED
@@ -239,4 +239,66 @@ if __name__ == "__main__":
 
     print("\n----- Evaluation Result -----")
     print(f"Review Flag: {evaluation_result['review_flag']}")
-    print(f"Explanation: {evaluation_result['explanation']}")
+    print(f"Explanation: {evaluation_result['explanation']}")
+
+
+#######
+from typing import List, Tuple, Callable
+
+def evaluate_retrieval_precision(
+    questions: List[str],
+    system: Callable[[str], List[str]],
+    evaluator: Callable[[str, str], int],
+    num_chunks_expected: int = 3,
+    verbose: bool = True
+) -> dict:
+    """
+    Evaluates the retrieval precision of a system using an LLM evaluator.
+
+    Args:
+        questions: A list of evaluation questions.
+        system: A function that takes a question and returns a list of retrieved chunks.
+        evaluator: A function that takes a question and a chunk and returns a relevance score (0 or 1).
+        num_chunks_expected: The number of chunks the system is expected to return. Defaults to 3.
+        verbose: Whether to print warnings for questions that return fewer chunks than expected.
+
+    Returns:
+        A dictionary containing:
+            - 'mean_precision': The mean retrieval precision score across all questions.
+            - 'precision_scores': A list of precision scores, one per question.
+            - 'question_relevance': A list of (question, relevant_chunk_count) tuples.
+
+    """
+
+    results = {
+        'mean_precision': 0.0,
+        'precision_scores': [],
+        'question_relevance': []
+    }
+
+    for i, question in enumerate(questions):
+        retrieved_chunks = system(question)
+
+        # Warn if fewer chunks are returned than expected
+        if len(retrieved_chunks) < num_chunks_expected and verbose:
+            print(f"Warning: System returned {len(retrieved_chunks)} chunks (expected {num_chunks_expected}) for question {i+1}: {question}")
+
+        # Calculate precision for the current question
+        relevant_chunks = sum(evaluator(question, chunk) for chunk in retrieved_chunks)
+        precision = relevant_chunks / len(retrieved_chunks) if retrieved_chunks else 0
+        results['precision_scores'].append(precision)
+
+        # Store the question and its relevant chunk count
+        results['question_relevance'].append((question, relevant_chunks))
+
+    # Calculate mean precision across all questions
+    results['mean_precision'] = sum(results['precision_scores']) / len(questions) if questions else 0
+    return results
+
+# Example usage (assuming you've defined 'questions', 'system', and 'evaluator'):
+evaluation_results = evaluate_retrieval_precision(
+    questions, system, evaluator, num_chunks_expected=3, verbose=True
+)
+print(f"Mean Retrieval Precision: {evaluation_results['mean_precision']:.2f}")
+print(f"Precision Scores for Each Question: {evaluation_results['precision_scores']}")
+print(f"Question Relevance: {evaluation_results['question_relevance']}")
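For quickly exercising the new helper outside the module, a minimal sketch like the one below could be used. The stub_system and stub_evaluator stand-ins are hypothetical placeholders (a canned retriever and a crude keyword-overlap judge), not part of this change; any retriever and LLM-based evaluator with the same signatures would slot in the same way.

# Minimal sketch, assuming evaluate_retrieval_precision from this diff is
# already defined or imported into the current scope.
# stub_system and stub_evaluator are hypothetical placeholders, not part of
# metrics_v2.py.
from typing import List

def stub_system(question: str) -> List[str]:
    # Pretend retriever: always returns the same three chunks.
    return [
        "Paris is the capital of France.",
        "The Eiffel Tower is in Paris.",
        "Bananas are rich in potassium.",
    ]

def stub_evaluator(question: str, chunk: str) -> int:
    # Crude relevance judge: 1 if the chunk shares any word longer than
    # three characters with the question, else 0.
    question_words = {w.strip("?.,").lower() for w in question.split() if len(w) > 3}
    chunk_words = {w.strip("?.,").lower() for w in chunk.split() if len(w) > 3}
    return 1 if question_words & chunk_words else 0

questions = ["What is the capital of France?"]
results = evaluate_retrieval_precision(questions, stub_system, stub_evaluator)
print(results["mean_precision"])  # 1 relevant chunk out of 3 retrieved -> ~0.33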