Create metrics.py
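Adds a metrics module that scores AI-generated Portuguese educational content against reference text: semantic similarity (Sentence Transformers), BERTScore, BLEU, ROUGE-L and METEOR, plus an LLM judgment, and tunes the flagging thresholds with Hyperopt while logging each trial to MLflow.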
metrics.py ADDED (+247, -0)
import nltk
import mlflow
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer, util
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from tqdm import tqdm
from datasets import load_metric  # Note: recent `datasets` versions removed load_metric; use `evaluate.load` from the evaluate library instead

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# --- Load pre-trained models ---
# Research and update these with the most recent and powerful Portuguese models
semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
# A causal (GPT-style) Portuguese model is needed here: the unicamp-dl/ptt5-* checkpoints are
# T5 encoder-decoders and cannot be loaded with AutoModelForCausalLM.
perplexity_model_name = "pierreguillou/gpt2-small-portuguese"
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)

# Load Hugging Face metrics
bertscore_metric = load_metric("bertscore")
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
meteor_metric = load_metric("meteor")  # Additional metric

# Load a powerful LLM for generating and judging content
# NOTE: "gpt-3.5-turbo" is an OpenAI API model and cannot be loaded with transformers.pipeline;
# point this at an open Hugging Face text-generation model, or use the API-backed sketch below.
generator_model_name = "gpt-3.5-turbo"  # Or GPT-4 or Gemini if available
generator = pipeline("text-generation", model=generator_model_name)
judge_model_name = generator_model_name  # Using the same model for judging
judge = pipeline("text-generation", model=judge_model_name)

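# --- Optional: API-backed judge (sketch, not wired into the flow below) ---
# A minimal alternative if gpt-3.5-turbo really is the target model. It assumes the `openai`
# package (>= 1.0) is installed and OPENAI_API_KEY is set; the function name is illustrative.
def get_llm_judgment_via_api(prompt, model_name="gpt-3.5-turbo"):
    """Judge a prompt through the OpenAI chat API instead of a local pipeline."""
    from openai import OpenAI  # imported lazily so the module still loads without `openai`
    client = OpenAI()
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=20,
    )
    return response.choices[0].message.content.strip().lower()
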
# --- Helper Functions ---
def calculate_perplexity(text):
    """Calculates perplexity of text using a Portuguese LLM model."""
    try:
        with torch.no_grad():
            tokenize_input = perplexity_tokenizer.tokenize(text)
            tensor_input = perplexity_tokenizer.encode(text, return_tensors='pt')
            loss = perplexity_model(tensor_input, labels=tensor_input)[0]
            return torch.exp(loss).item()  # perplexity = exp(mean token cross-entropy)
    except Exception as e:
        print(f"Error calculating perplexity: {e}")
        return float('inf')


def estimate_semantic_similarity(generated_text, reference_text):
    """Estimates semantic similarity using a Portuguese Sentence Transformer."""
    try:
        embedding1 = semantic_similarity_model.encode(generated_text, convert_to_tensor=True)
        embedding2 = semantic_similarity_model.encode(reference_text, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_sim.item()
    except Exception as e:
        print(f"Error calculating semantic similarity: {e}")
        return 0.0


def calculate_metrics(generated_text, reference_text):
    """Calculates BERTScore, BLEU, ROUGE, and METEOR metrics."""
    results = {}
    try:
        results['bertscore'] = bertscore_metric.compute(predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
    except Exception as e:
        print(f"Error calculating BERTScore: {e}")
        results['bertscore'] = None

    try:
        # The legacy `bleu` metric expects pre-tokenized predictions and references
        bleu_results = bleu_metric.compute(predictions=[generated_text.split()], references=[[reference_text.split()]])
        results['bleu'] = bleu_results['bleu']
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        results['bleu'] = None

    try:
        rouge_results = rouge_metric.compute(predictions=[generated_text], references=[reference_text])
        rouge_l = rouge_results['rougeL']
        # Older `datasets` versions return an AggregateScore here; newer `evaluate` versions return a float
        results['rougeL'] = rouge_l.mid.fmeasure if hasattr(rouge_l, 'mid') else rouge_l
    except Exception as e:
        print(f"Error calculating ROUGE: {e}")
        results['rougeL'] = None

    try:
        meteor_results = meteor_metric.compute(predictions=[generated_text], references=[reference_text])
        results['meteor'] = meteor_results['meteor']
    except Exception as e:
        print(f"Error calculating METEOR: {e}")
        results['meteor'] = None

    return results


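# Shape of the dict returned by calculate_metrics (each value is a float, or None if that
# metric failed); these keys are consumed by analyze_content_for_review below:
#   {'bertscore': ..., 'bleu': ..., 'rougeL': ..., 'meteor': ...}
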
def get_llm_judgment(generated_text, reference_text):
    """Gets a judgment from a powerful LLM on the quality of the generated text."""
    prompt = f"""
    You are an expert in evaluating educational content.
    Please evaluate the following generated text based on its accuracy, relevance, and clarity,
    compared to the provided reference text.

    Reference Text:
    {reference_text}

    Generated Text:
    {generated_text}

    Provide your judgment as one of the following categories:
    - "no issues": The generated text is accurate, relevant, and clear.
    - "minor issues": The generated text has some minor issues, but is mostly acceptable.
    - "major issues": The generated text has significant issues and needs substantial revision.
    """
    # Generate only a short verdict, strip the echoed prompt, and lowercase it so the
    # category checks below are case-insensitive.
    judgment = judge(prompt, max_new_tokens=20, return_full_text=False)[0]['generated_text'].strip().lower()
    return judgment


# --- Content Analysis Function ---
def analyze_content_for_review(generated_text, reference_text,
                               similarity_threshold,
                               bertscore_threshold,
                               bleu_threshold,
                               rouge_threshold,
                               meteor_threshold):
    """Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
    similarity = estimate_semantic_similarity(generated_text, reference_text)
    metrics = calculate_metrics(generated_text, reference_text)
    llm_judgment = get_llm_judgment(generated_text, reference_text)

    issues = []
    if similarity < similarity_threshold:
        issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
    if metrics['bertscore'] is not None and metrics['bertscore'] < bertscore_threshold:
        issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
    if metrics['bleu'] is not None and metrics['bleu'] < bleu_threshold:
        issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
    if metrics['rougeL'] is not None and metrics['rougeL'] < rouge_threshold:
        issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
    if metrics['meteor'] is not None and metrics['meteor'] < meteor_threshold:
        issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")

    # Use LLM judgment as the primary decision-maker (substring match, since the model may
    # wrap the category in extra words)
    if "major issues" in llm_judgment:
        review_flag = True
        explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
    elif "minor issues" in llm_judgment:
        review_flag = True
        explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
    else:
        review_flag = False
        explanation = "LLM Judgment: **No Issues**"

    return {
        'review_flag': review_flag,
        'explanation': explanation,
        'semantic_similarity': similarity,
        'metrics': metrics,
        'llm_judgment': llm_judgment,
        'threshold_issues': issues,  # exposed so the tuning objective can compare threshold flags with the LLM verdict
        'generated_text': generated_text,
        'reference_text': reference_text
    }


# --- Threshold Optimization Functions ---
def generate_educational_content(topic, num_sections=3):
    """Generates educational content with chapters, topics, sections, and subsections."""
    prompt = f"""
    Generate a chapter of educational content on the topic of "{topic}".
    The chapter should include {num_sections} sections, each with at least
    one subsection. The content should be factually accurate, well-organized,
    and written in clear and concise Portuguese.
    """
    generated_content = generator(prompt, max_length=1000)[0]['generated_text']
    return generated_content

def objective(params):
    """Objective function for Hyperopt to minimize."""
    similarity_threshold = params['similarity_threshold']
    bertscore_threshold = params['bertscore_threshold']
    bleu_threshold = params['bleu_threshold']
    rouge_threshold = params['rouge_threshold']
    meteor_threshold = params['meteor_threshold']

    # Generate AI-created data
    topics = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"]  # More topics
    generated_texts = []
    reference_texts = []
    for topic in topics:
        reference_text = generate_educational_content(topic)
        generated_text = generate_educational_content(topic)
        generated_texts.append(generated_text)
        reference_texts.append(reference_text)

    total_errors = 0
    for gen_text, ref_text in zip(generated_texts, reference_texts):
        result = analyze_content_for_review(gen_text, ref_text,
                                             similarity_threshold,
                                             bertscore_threshold,
                                             bleu_threshold,
                                             rouge_threshold,
                                             meteor_threshold)
        # Count a disagreement when the thresholds flag issues but the LLM-driven review flag
        # does not (the flag itself follows the LLM judgment, so the original check never fired).
        if result['threshold_issues'] and not result['review_flag']:
            total_errors += 1

    # Log metrics and parameters to MLflow
    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_metric("total_errors", total_errors)

    return {'loss': total_errors, 'status': STATUS_OK}

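# Design note on the objective above: the loss counts texts that the thresholds would flag even
# though the LLM judge sees no issues, so the TPE search favors thresholds that do not over-flag
# relative to the judge's verdicts on the generated sample.
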
# --- Main Execution ---
if __name__ == "__main__":
    # 1. Threshold Optimization Phase
    mlflow.set_tracking_uri("http://localhost:5000")  # Or your MLflow server URI
    search_space = {  # Hyperparameter search space
        'similarity_threshold': hp.uniform('similarity_threshold', 0.5, 0.9),
        'bertscore_threshold': hp.uniform('bertscore_threshold', 0.7, 0.95),
        'bleu_threshold': hp.uniform('bleu_threshold', 0.4, 0.8),
        'rouge_threshold': hp.uniform('rouge_threshold', 0.4, 0.7),
        'meteor_threshold': hp.uniform('meteor_threshold', 0.3, 0.7)
    }
    trials = Trials()
    best_thresholds = fmin(fn=objective,
                           space=search_space,
                           algo=tpe.suggest,
                           max_evals=50,  # Adjust the number of evaluations as needed
                           trials=trials)
    print("Best thresholds found:", best_thresholds)

    # 2. Content Evaluation Phase (using the best thresholds)
    new_generated_text = generate_educational_content("Matemática")  # Example
    new_reference_text = "Content from your educational material..."

    evaluation_result = analyze_content_for_review(
        new_generated_text, new_reference_text,
        best_thresholds['similarity_threshold'],
        best_thresholds['bertscore_threshold'],
        best_thresholds['bleu_threshold'],
        best_thresholds['rouge_threshold'],
        best_thresholds['meteor_threshold']
    )

    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")
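# To exercise the full script, start an MLflow tracking server that matches the URI above
# (e.g. `mlflow server --port 5000`) and make sure the Hugging Face models and metrics can be
# downloaded; the number of Hyperopt evaluations (max_evals) drives the total runtime.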