Concepta committed on
Commit ea93d91
1 Parent(s): 5838405

Update metrics_v2.py

Files changed (1)
  1. metrics_v2.py +242 -0
metrics_v2.py CHANGED
@@ -0,0 +1,242 @@
+ import json
+
+ import nltk
+ from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
+ import torch
+ from sentence_transformers import SentenceTransformer, util
+ from bert_score import score
+ from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+ from rouge import Rouge
+ from tqdm import tqdm
+ from datasets import load_metric  # removed in datasets>=3.0; `evaluate.load` is the drop-in replacement
+
+ # Download necessary NLTK data
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ # --- Model and Metric Loading ---
+ class ContentEvaluator:
+     def __init__(self):
+         self.semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
+         self.perplexity_model_name = "unicamp-dl/ptt5-base-portuguese-vocab"
+         self.perplexity_tokenizer = AutoTokenizer.from_pretrained(self.perplexity_model_name)
+
+         # Load Hugging Face metrics
+         self.bertscore_metric = load_metric("bertscore")
+         self.bleu_metric = load_metric("bleu")
+         self.rouge_metric = load_metric("rouge")
+         self.meteor_metric = load_metric("meteor")
+         self.sacrebleu_metric = load_metric("sacrebleu")  # more robust BLEU implementation
+
+         # Load a powerful LLM for judging content and detecting hallucinations.
+         # Note: "gpt-3.5-turbo" is an OpenAI API model, not a Hugging Face checkpoint, so this
+         # pipeline call will fail as written; point it at an instruction-tuned model on the Hub
+         # or replace the pipeline with an API client (Gemini or GPT-4 if available).
+         self.judge_model_name = "gpt-3.5-turbo"
+         self.judge = pipeline("text-generation", model=self.judge_model_name)
+
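+         # A minimal sketch of an alternative local judge, assuming an open instruction-tuned
+         # checkpoint is acceptable (the model name below is an illustrative placeholder, not
+         # part of the original code):
+         # self.judge = pipeline(
+         #     "text-generation",
+         #     model="mistralai/Mistral-7B-Instruct-v0.2",  # assumed placeholder judge model
+         #     device_map="auto",
+         # )
+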
+     def calculate_perplexity(self, text):
+         """
+         Calculates the perplexity of a text using a Portuguese language model.
+
+         Perplexity measures how well the language model predicts the text.
+         Lower perplexity indicates the text is more predictable and likely grammatically correct;
+         higher perplexity suggests the text is surprising or unusual, potentially indicating
+         errors or nonsensical content.
+         """
+         try:
+             # Note: the model is reloaded on every call, which is slow; ptt5 is also an
+             # encoder-decoder (T5) checkpoint, so a decoder-only Portuguese model is needed
+             # for AutoModelForCausalLM to load it.
+             perplexity_model = AutoModelForCausalLM.from_pretrained(self.perplexity_model_name)
+             with torch.no_grad():
+                 tensor_input = self.perplexity_tokenizer.encode(text, return_tensors='pt')
+                 loss = perplexity_model(tensor_input, labels=tensor_input)[0]
+                 return torch.exp(loss).item()
+         except Exception as e:
+             print(f"Error calculating perplexity: {e}")
+             return float('inf')
+
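+     # For reference, the value returned above is the standard definition of perplexity:
+     # PPL(x) = exp( -(1/N) * sum_i log p(x_i | x_<i) ), i.e. the exponential of the mean
+     # cross-entropy loss the model reports when labels == input_ids.
+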
+     def detect_hallucination_with_llm(self, text, window_size=200):
+         """
+         Detects potential hallucinations using an LLM judge with a refined prompt.
+         """
+         hallucinations = []
+         text_chunks = nltk.word_tokenize(text)
+
+         for i in range(0, len(text_chunks), window_size):
+             chunk = " ".join(text_chunks[i:i + window_size])
+
+             prompt = f"""
+ You are an expert in identifying factual errors and inconsistencies in educational text.
+ Your task is to meticulously analyze the provided text excerpt and pinpoint any potential hallucinations.
+
+ Focus on identifying claims or statements that exhibit the following characteristics:
+
+ * **Factual Inaccuracy:** Assertions that are demonstrably false or lack credible supporting evidence.
+ * **Logical Fallacies:** Statements containing flawed reasoning or internal contradictions.
+ * **Nonsensical Claims:** Assertions that are absurd, meaningless, or defy common sense.
+ * **Invented Information:** Fabricated details or events that have no basis in reality.
+
+ Text Excerpt:
+ ```
+ {chunk}
+ ```
+
+ For each potential hallucination, provide:
+ - **Hallucination:** The specific text you believe is a hallucination.
+ - **Explanation:** A detailed and precise justification for why you classify it as a hallucination.
+
+ Return your analysis as a JSON list of dictionaries, strictly adhering to the following format:
+
+ ```json
+ [
+   {{"hallucination": "[The hallucinated text]", "explanation": "[Your detailed explanation]"}}
+ ]
+ ```
+ """
+             # max_new_tokens bounds only the generated continuation (max_length would include the
+             # long prompt); return_full_text=False keeps the prompt out of the parsed output.
+             response = self.judge(prompt, max_new_tokens=300, return_full_text=False)[0]['generated_text'].strip()
+
+             try:
+                 # Parse the model's JSON answer; json.loads is safer than eval on model output.
+                 chunk_hallucinations = json.loads(response)
+                 for hallucination in chunk_hallucinations:
+                     hallucinations.append({
+                         'chunk': chunk,
+                         'hallucination': hallucination['hallucination'],
+                         'explanation': hallucination['explanation']
+                     })
+             except Exception as e:
+                 print(f"Error parsing LLM response: {e}")
+                 print(f"LLM Response: {response}")
+
+         return hallucinations
+
+     def calculate_metrics(self, generated_text, reference_text):
+         """Calculates BERTScore, BLEU, ROUGE, METEOR, and SacreBLEU metrics."""
+         results = {}
+         try:
+             results['bertscore'] = self.bertscore_metric.compute(predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
+             bleu_results = self.bleu_metric.compute(predictions=[generated_text.split()], references=[[reference_text.split()]])
+             results['bleu'] = bleu_results['bleu']
+             rouge_results = self.rouge_metric.compute(predictions=[generated_text], references=[reference_text])
+             results['rougeL'] = rouge_results['rougeL'].mid.fmeasure  # aggregate F-measure as a float, not the AggregateScore object
+             meteor_results = self.meteor_metric.compute(predictions=[generated_text], references=[reference_text])
+             results['meteor'] = meteor_results['meteor']
+
+             # SacreBLEU (more robust BLEU implementation)
+             sacrebleu_results = self.sacrebleu_metric.compute(predictions=[generated_text], references=[[reference_text]])
+             results['sacrebleu'] = sacrebleu_results['score']
+
+         except Exception as e:
+             print(f"Error calculating metrics: {e}")
+             results = {'bertscore': None, 'bleu': None, 'rougeL': None, 'meteor': None, 'sacrebleu': None}
+         return results
+
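+     # Note on scales (useful when picking thresholds): BERTScore-F1, BLEU, ROUGE-L and METEOR
+     # as returned above are roughly in [0, 1], while SacreBLEU reports a score in [0, 100].
+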
+     def analyze_text(self, text, perplexity_threshold=40):
+         """
+         Analyzes a text for perplexity and potential hallucinations.
+         """
+         results = []
+         sentences = nltk.sent_tokenize(text)
+
+         for i, sentence in enumerate(sentences):
+             perplexity = self.calculate_perplexity(sentence)
+             hallucinations = self.detect_hallucination_with_llm(sentence)
+
+             issues = []
+             if perplexity > perplexity_threshold:
+                 issues.append(f"- **High Perplexity:** ({perplexity:.2f}) The sentence might be grammatically incorrect or nonsensical.")
+             if hallucinations:
+                 for hallucination in hallucinations:
+                     issues.append(f"- **Potential Hallucination (LLM):** {hallucination['hallucination']} - {hallucination['explanation']}")
+
+             review_flag = len(issues) > 0
+             explanation = "\n".join(issues) if issues else "No potential issues detected."
+
+             results.append({
+                 'sentence_index': i,
+                 'review_flag': review_flag,
+                 'explanation': explanation,
+                 'perplexity': perplexity,
+                 'hallucinations': hallucinations,
+                 'sentence': sentence
+             })
+
+         return results
+
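+     # The two helpers below are referenced by analyze_content_for_review() but are not defined
+     # anywhere in this commit. They are minimal sketches so that method can run: the similarity
+     # helper reuses the SentenceTransformer loaded in __init__, and the judgment helper's prompt
+     # and parsing are assumptions that simply mirror the labels checked below
+     # ("no issues" / "minor issues" / "major issues").
+     def estimate_semantic_similarity(self, generated_text, reference_text):
+         """Cosine similarity between sentence embeddings of the two texts (sketch)."""
+         embeddings = self.semantic_similarity_model.encode([generated_text, reference_text], convert_to_tensor=True)
+         return util.cos_sim(embeddings[0], embeddings[1]).item()
+
+     def get_llm_judgment(self, generated_text, reference_text):
+         """Asks the judge LLM for an overall verdict: 'no issues', 'minor issues', or 'major issues' (sketch)."""
+         prompt = (
+             "Compare the generated text with the reference text and answer with exactly one of: "
+             "'no issues', 'minor issues', 'major issues'.\n\n"
+             f"Reference:\n{reference_text}\n\nGenerated:\n{generated_text}\n\nVerdict:"
+         )
+         response = self.judge(prompt, max_new_tokens=10, return_full_text=False)[0]['generated_text'].lower()
+         for label in ("major issues", "minor issues", "no issues"):
+             if label in response:
+                 return label
+         return "no issues"  # default when the judge's answer cannot be parsed
+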
+     def analyze_content_for_review(self, generated_text, reference_text,
+                                    similarity_threshold,
+                                    bertscore_threshold,
+                                    bleu_threshold,
+                                    rouge_threshold,
+                                    meteor_threshold):
+         """Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
+         similarity = self.estimate_semantic_similarity(generated_text, reference_text)
+         metrics = self.calculate_metrics(generated_text, reference_text)
+         llm_judgment = self.get_llm_judgment(generated_text, reference_text)
+
+         issues = []
+         if similarity < similarity_threshold:
+             issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
+         if metrics['bertscore'] and metrics['bertscore'] < bertscore_threshold:
+             issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
+         if metrics['bleu'] and metrics['bleu'] < bleu_threshold:
+             issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
+         if metrics['rougeL'] and metrics['rougeL'] < rouge_threshold:
+             issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
+         if metrics['meteor'] and metrics['meteor'] < meteor_threshold:
+             issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")
+
+         # Use LLM judgment as the primary decision-maker
+         if llm_judgment == "major issues":
+             review_flag = True
+             explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
+         elif llm_judgment == "minor issues":
+             review_flag = True
+             explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
+         else:
+             review_flag = False
+             explanation = "LLM Judgment: **No Issues**"
+
+         return {
+             'review_flag': review_flag,
+             'explanation': explanation,
+             'semantic_similarity': similarity,
+             'metrics': metrics,
+             'llm_judgment': llm_judgment,
+             'generated_text': generated_text,
+             'reference_text': reference_text
+         }
+
+ # --- Example Usage ---
+ if __name__ == "__main__":
+     evaluator = ContentEvaluator()
+
+     # Example text (replace with your actual data). The Portuguese sentences deliberately mix
+     # false claims (a flat Earth, a Moon made of cheese, dinosaurs living in the Amazon) with
+     # true ones to exercise the perplexity and hallucination checks.
+     text = """
+     A Terra é plana e o Sol gira em torno dela.
+     A gravidade é uma força fraca.
+     As plantas precisam de água para sobreviver.
+     A Lua é feita de queijo.
+     Os dinossauros ainda vivem na Amazônia.
+     """
+
+     analysis_results = evaluator.analyze_text(text)
+
+     for result in analysis_results:
+         print(f"----- Sentence {result['sentence_index'] + 1} -----")
+         print(f"Review Flag: {result['review_flag']}")
+         print(f"Explanation: {result['explanation']}")
+         print(f"Perplexity: {result['perplexity']:.2f}")
+         print(f"Sentence: {result['sentence']}\n")
+
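+     # The evaluation step below relies on `best_thresholds` and `generate_educational_content`,
+     # neither of which is defined in this file (they are presumably produced by an earlier
+     # threshold-tuning step). The dictionary here is an illustrative placeholder so the example
+     # can run end to end; the values are assumptions, not tuned thresholds.
+     best_thresholds = {
+         'similarity_threshold': 0.7,   # assumed placeholder
+         'bertscore_threshold': 0.7,    # assumed placeholder
+         'bleu_threshold': 0.3,         # assumed placeholder
+         'rouge_threshold': 0.3,        # assumed placeholder
+         'meteor_threshold': 0.3,       # assumed placeholder
+     }
+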
+     # 2. Content Evaluation Phase (using the best thresholds)
+     new_generated_text = evaluator.generate_educational_content("Matemática")  # defined elsewhere, not in this commit
+     new_reference_text = "Content from your educational material..."
+
+     evaluation_result = evaluator.analyze_content_for_review(
+         new_generated_text, new_reference_text,
+         best_thresholds['similarity_threshold'],
+         best_thresholds['bertscore_threshold'],
+         best_thresholds['bleu_threshold'],
+         best_thresholds['rouge_threshold'],
+         best_thresholds['meteor_threshold']
+     )
+
+     print("\n----- Evaluation Result -----")
+     print(f"Review Flag: {evaluation_result['review_flag']}")
+     print(f"Explanation: {evaluation_result['explanation']}")