Update metrics_v2.py
metrics_v2.py CHANGED (+242 -0)
@@ -0,0 +1,242 @@
import json  # used to parse the judge model's JSON output
import nltk
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer, util
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from tqdm import tqdm
from datasets import load_metric  # note: recent `datasets` releases drop load_metric; the `evaluate` package provides the same metrics

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# --- Model and Metric Loading ---
class ContentEvaluator:
    def __init__(self):
        self.semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
        # NOTE: ptt5 is a T5 (encoder-decoder) checkpoint; calculate_perplexity() loads it with
        # AutoModelForCausalLM, which only accepts decoder-only architectures, so a causal
        # Portuguese LM would be needed for that method to work as written.
        self.perplexity_model_name = "unicamp-dl/ptt5-base-portuguese-vocab"
        self.perplexity_tokenizer = AutoTokenizer.from_pretrained(self.perplexity_model_name)

        # Load Hugging Face metrics
        self.bertscore_metric = load_metric("bertscore")
        self.bleu_metric = load_metric("bleu")
        self.rouge_metric = load_metric("rouge")
        self.meteor_metric = load_metric("meteor")
        self.sacrebleu_metric = load_metric("sacrebleu")  # more robust BLEU implementation

        # Load a powerful LLM for judging content and detecting hallucinations
        self.judge_model_name = "gpt-3.5-turbo"  # Gemini or GPT-4 if available
        self.judge = pipeline("text-generation", model=self.judge_model_name)
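        # NOTE (assumption, not part of the original logic): "gpt-3.5-turbo" is an OpenAI API model,
        # not a Hugging Face Hub checkpoint, so the pipeline() call above cannot resolve it and will
        # fail at construction time. One illustrative workaround is to point the judge at an open
        # instruction-tuned Hub model instead, e.g.:
        #
        #     self.judge = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
        #
        # (the model id here is only an example); the other option, hinted at by the comment above,
        # is to call the OpenAI or Gemini APIs through their own clients rather than transformers.
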
    def calculate_perplexity(self, text):
        """
        Calculates the perplexity of a text using a Portuguese language model.

        Perplexity measures how well the language model predicts the text.
        Lower perplexity indicates the text is more predictable and likely to be grammatically correct;
        higher perplexity suggests the text is surprising or unusual, potentially indicating errors or nonsensical content.
        """
        try:
            perplexity_model = AutoModelForCausalLM.from_pretrained(self.perplexity_model_name)
            with torch.no_grad():
                tensor_input = self.perplexity_tokenizer.encode(text, return_tensors='pt')
                # Perplexity is exp of the mean cross-entropy loss the model assigns to the text
                loss = perplexity_model(tensor_input, labels=tensor_input)[0]
            return torch.exp(loss).item()
        except Exception as e:
            print(f"Error calculating perplexity: {e}")
            return float('inf')

    def detect_hallucination_with_llm(self, text, window_size=200):
        """
        Detects potential hallucinations using an LLM judge with a refined prompt.
        """
        hallucinations = []
        text_chunks = nltk.word_tokenize(text)

        for i in range(0, len(text_chunks), window_size):
            chunk = " ".join(text_chunks[i:i + window_size])

            prompt = f"""
You are an expert in identifying factual errors and inconsistencies in educational text.
Your task is to meticulously analyze the provided text excerpt and pinpoint any potential hallucinations.

Focus on identifying claims or statements that exhibit the following characteristics:

* **Factual Inaccuracy:** Assertions that are demonstrably false or lack credible supporting evidence.
* **Logical Fallacies:** Statements containing flawed reasoning or internal contradictions.
* **Nonsensical Claims:** Assertions that are absurd, meaningless, or defy common sense.
* **Invented Information:** Fabricated details or events that have no basis in reality.

Text Excerpt:
```
{chunk}
```

For each potential hallucination, provide:
- **Hallucination:** The specific text you believe is a hallucination.
- **Explanation:** A detailed and precise justification for why you classify it as a hallucination.

Return your analysis as a JSON list of dictionaries, strictly adhering to the following format:

```json
[
    {{"hallucination": "[The hallucinated text]", "explanation": "[Your detailed explanation]"}}
]
```
"""
            # return_full_text=False keeps the prompt out of the completion; max_new_tokens bounds
            # only the generated answer (max_length would also count the long prompt).
            response = self.judge(prompt, max_new_tokens=300, return_full_text=False)[0]['generated_text'].strip()

            try:
                # Parse the JSON answer rather than eval(), which would execute arbitrary model output
                chunk_hallucinations = json.loads(response)
                for hallucination in chunk_hallucinations:
                    hallucinations.append({
                        'chunk': chunk,
                        'hallucination': hallucination['hallucination'],
                        'explanation': hallucination['explanation']
                    })
            except Exception as e:
                print(f"Error parsing LLM response: {e}")
                print(f"LLM Response: {response}")

        return hallucinations

    def calculate_metrics(self, generated_text, reference_text):
        """Calculates BERTScore, BLEU, ROUGE, METEOR, and SacreBLEU metrics."""
        results = {}
        try:
            results['bertscore'] = self.bertscore_metric.compute(predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
            bleu_results = self.bleu_metric.compute(predictions=[generated_text.split()], references=[[reference_text.split()]])
            results['bleu'] = bleu_results['bleu']
            # The rouge metric returns aggregate Score objects; keep the mid F-measure for ROUGE-L
            rouge_results = self.rouge_metric.compute(predictions=[generated_text], references=[reference_text])
            results['rougeL'] = rouge_results['rougeL'].mid.fmeasure
            meteor_results = self.meteor_metric.compute(predictions=[generated_text], references=[reference_text])
            results['meteor'] = meteor_results['meteor']

            # SacreBLEU (more robust BLEU implementation)
            sacrebleu_results = self.sacrebleu_metric.compute(predictions=[generated_text], references=[[reference_text]])
            results['sacrebleu'] = sacrebleu_results['score']

        except Exception as e:
            print(f"Error calculating metrics: {e}")
            results = {'bertscore': None, 'bleu': None, 'rougeL': None, 'meteor': None, 'sacrebleu': None}
        return results

    def analyze_text(self, text, perplexity_threshold=40):
        """
        Analyzes a text for perplexity and potential hallucinations.
        """
        results = []
        sentences = nltk.sent_tokenize(text)

        for i, sentence in enumerate(sentences):
            perplexity = self.calculate_perplexity(sentence)
            hallucinations = self.detect_hallucination_with_llm(sentence)

            issues = []
            if perplexity > perplexity_threshold:
                issues.append(f"- **High Perplexity:** ({perplexity:.2f}) The sentence might be grammatically incorrect or nonsensical.")
            if hallucinations:
                for hallucination in hallucinations:
                    issues.append(f"- **Potential Hallucination (LLM):** {hallucination['hallucination']} - {hallucination['explanation']}")

            review_flag = len(issues) > 0
            explanation = "\n".join(issues) if issues else "No potential issues detected."

            results.append({
                'sentence_index': i,
                'review_flag': review_flag,
                'explanation': explanation,
                'perplexity': perplexity,
                'hallucinations': hallucinations,
                'sentence': sentence
            })

        return results

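    # The two helpers below are called by analyze_content_for_review() but are not defined anywhere
    # in this diff. These are minimal sketches inferred from the imports and the attributes set up
    # in __init__ (the method names come from the call sites; the bodies are assumptions).
    def estimate_semantic_similarity(self, generated_text, reference_text):
        """Sketch: cosine similarity between sentence embeddings of the two texts."""
        embeddings = self.semantic_similarity_model.encode(
            [generated_text, reference_text], convert_to_tensor=True
        )
        return util.cos_sim(embeddings[0], embeddings[1]).item()

    def get_llm_judgment(self, generated_text, reference_text):
        """Sketch: ask the judge model for one of 'no issues', 'minor issues' or 'major issues'
        (the labels analyze_content_for_review() branches on)."""
        prompt = (
            "Compare the generated text with the reference text and answer with exactly one of: "
            "'no issues', 'minor issues', 'major issues'.\n\n"
            f"Reference:\n{reference_text}\n\nGenerated:\n{generated_text}\n\nAnswer:"
        )
        response = self.judge(prompt, max_new_tokens=10, return_full_text=False)[0]['generated_text']
        response = response.strip().lower()
        for label in ("major issues", "minor issues", "no issues"):
            if label in response:
                return label
        return "no issues"
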
    def analyze_content_for_review(self, generated_text, reference_text,
                                   similarity_threshold,
                                   bertscore_threshold,
                                   bleu_threshold,
                                   rouge_threshold,
                                   meteor_threshold):
        """Analyzes content and flags potential issues based on the provided thresholds and LLM judgment."""
        similarity = self.estimate_semantic_similarity(generated_text, reference_text)
        metrics = self.calculate_metrics(generated_text, reference_text)
        llm_judgment = self.get_llm_judgment(generated_text, reference_text)

        issues = []
        if similarity < similarity_threshold:
            issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
        if metrics['bertscore'] and metrics['bertscore'] < bertscore_threshold:
            issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
        if metrics['bleu'] and metrics['bleu'] < bleu_threshold:
            issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
        if metrics['rougeL'] and metrics['rougeL'] < rouge_threshold:
            issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
        if metrics['meteor'] and metrics['meteor'] < meteor_threshold:
            issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")

        # Use the LLM judgment as the primary decision-maker
        if llm_judgment == "major issues":
            review_flag = True
            explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
        elif llm_judgment == "minor issues":
            review_flag = True
            explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
        else:
            review_flag = False
            explanation = "LLM Judgment: **No Issues**"

        return {
            'review_flag': review_flag,
            'explanation': explanation,
            'semantic_similarity': similarity,
            'metrics': metrics,
            'llm_judgment': llm_judgment,
            'generated_text': generated_text,
            'reference_text': reference_text
        }

# --- Example Usage ---
if __name__ == "__main__":
    evaluator = ContentEvaluator()

    # Example text (replace with your actual data): Portuguese sample sentences, several of them
    # deliberately false (e.g. "A Terra é plana" = "The Earth is flat") to exercise the detector.
    text = """
    A Terra é plana e o Sol gira em torno dela.
    A gravidade é uma força fraca.
    As plantas precisam de água para sobreviver.
    A Lua é feita de queijo.
    Os dinossauros ainda vivem na Amazônia.
    """

    analysis_results = evaluator.analyze_text(text)

    for result in analysis_results:
        print(f"----- Sentence {result['sentence_index'] + 1} -----")
        print(f"Review Flag: {result['review_flag']}")
        print(f"Explanation: {result['explanation']}")
        print(f"Perplexity: {result['perplexity']:.2f}")
        print(f"Sentence: {result['sentence']}\n")

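    # `best_thresholds` and evaluator.generate_educational_content() are used below but are not
    # defined in this file; they presumably come from an earlier threshold-tuning / content-
    # generation step. The placeholder values here are illustrative only, so the example can run.
    best_thresholds = {
        'similarity_threshold': 0.75,
        'bertscore_threshold': 0.80,
        'bleu_threshold': 0.30,
        'rouge_threshold': 0.40,
        'meteor_threshold': 0.40,
    }
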
    # 2. Content Evaluation Phase (using the best thresholds)
    new_generated_text = evaluator.generate_educational_content("Matemática")
    new_reference_text = "Content from your educational material..."

    evaluation_result = evaluator.analyze_content_for_review(
        new_generated_text, new_reference_text,
        best_thresholds['similarity_threshold'],
        best_thresholds['bertscore_threshold'],
        best_thresholds['bleu_threshold'],
        best_thresholds['rouge_threshold'],
        best_thresholds['meteor_threshold']
    )

    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")