Concepta committed on
Commit
3d97611
1 Parent(s): d267675

Create metrics.py

Files changed (1)
  1. metrics.py +247 -0
metrics.py ADDED
@@ -0,0 +1,247 @@
import nltk
import mlflow
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer, util
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from tqdm import tqdm
from datasets import load_metric

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
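# The METEOR metric relies on WordNet; fetching it up front is a hedge against a
# possible NLTK LookupError at metric time (assumption: the corpus is not already cached).
nltk.download('wordnet')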

# --- Load pre-trained models ---
# Research and update these with the most recent and powerful Portuguese models.
semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
# NOTE: the original choice here, "unicamp-dl/ptt5-base-portuguese-vocab", is a T5
# (encoder-decoder) checkpoint and cannot be loaded with AutoModelForCausalLM.
# A GPT-style Portuguese model is used instead (example choice; substitute as needed).
perplexity_model_name = "pierreguillou/gpt2-small-portuguese"
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)

# Load Hugging Face metrics
# (datasets.load_metric is deprecated; see the `evaluate` sketch below)
bertscore_metric = load_metric("bertscore")
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
meteor_metric = load_metric("meteor")  # Additional metric
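
# A minimal sketch of the newer `evaluate` API, which replaces the deprecated
# `datasets.load_metric` (assumption: the `evaluate` package is installed;
# metric names are unchanged):
#
#   import evaluate
#   bertscore_metric = evaluate.load("bertscore")
#   bleu_metric = evaluate.load("bleu")
#   rouge_metric = evaluate.load("rouge")
#   meteor_metric = evaluate.load("meteor")
#
# Note that evaluate's rouge returns plain floats rather than low/mid/high aggregates.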

# Load an LLM for generating and judging content.
# NOTE: hosted models such as gpt-3.5-turbo, GPT-4 or Gemini are only reachable
# through their own APIs and cannot be loaded with the transformers pipeline;
# an open Hub checkpoint is used here as a placeholder (see the hosted-judge sketch below).
generator_model_name = "pierreguillou/gpt2-small-portuguese"  # example choice
generator = pipeline("text-generation", model=generator_model_name)
judge_model_name = generator_model_name  # Using the same model for judging
judge = pipeline("text-generation", model=judge_model_name)
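
# A hedged sketch of using a hosted judge instead, as the comment above suggests
# (assumptions: the `openai` package is installed, OPENAI_API_KEY is set, and the
# model name is only an example):
#
#   from openai import OpenAI
#   client = OpenAI()
#   def hosted_judgment(prompt):
#       response = client.chat.completions.create(
#           model="gpt-4o-mini",
#           messages=[{"role": "user", "content": prompt}],
#       )
#       return response.choices[0].message.content.strip()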

# --- Helper Functions ---
def calculate_perplexity(text):
    """Calculates perplexity of text using the Portuguese causal language model."""
    try:
        with torch.no_grad():
            # Truncate to the model's maximum context so long texts do not error out.
            tensor_input = perplexity_tokenizer.encode(text, return_tensors='pt', truncation=True)
            loss = perplexity_model(tensor_input, labels=tensor_input).loss
            return torch.exp(loss).item()
    except Exception as e:
        print(f"Error calculating perplexity: {e}")
        return float('inf')


def estimate_semantic_similarity(generated_text, reference_text):
    """Estimates semantic similarity using a Portuguese Sentence Transformer."""
    try:
        embedding1 = semantic_similarity_model.encode(generated_text, convert_to_tensor=True)
        embedding2 = semantic_similarity_model.encode(reference_text, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_sim.item()
    except Exception as e:
        print(f"Error calculating semantic similarity: {e}")
        return 0.0


def calculate_metrics(generated_text, reference_text):
    """Calculates BERTScore, BLEU, ROUGE, and METEOR metrics."""
    results = {}
    try:
        results['bertscore'] = bertscore_metric.compute(
            predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
    except Exception as e:
        print(f"Error calculating BERTScore: {e}")
        results['bertscore'] = None

    try:
        # The datasets BLEU metric expects pre-tokenized predictions and references.
        bleu_results = bleu_metric.compute(predictions=[generated_text.split()],
                                           references=[[reference_text.split()]])
        results['bleu'] = bleu_results['bleu']
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        results['bleu'] = None

    try:
        rouge_results = rouge_metric.compute(predictions=[generated_text], references=[reference_text])
        # The datasets ROUGE metric returns low/mid/high aggregates; keep the mid F-measure.
        results['rougeL'] = rouge_results['rougeL'].mid.fmeasure
    except Exception as e:
        print(f"Error calculating ROUGE: {e}")
        results['rougeL'] = None

    try:
        meteor_results = meteor_metric.compute(predictions=[generated_text], references=[reference_text])
        results['meteor'] = meteor_results['meteor']
    except Exception as e:
        print(f"Error calculating METEOR: {e}")
        results['meteor'] = None

    return results
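
# Example usage (a hedged sketch; each metric comes back as a float, or None when
# that metric failed to compute):
#
#   scores = calculate_metrics("O Sol é uma estrela.", "O Sol é uma estrela da nossa galáxia.")
#   print(scores['bertscore'], scores['bleu'], scores['rougeL'], scores['meteor'])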


def get_llm_judgment(generated_text, reference_text):
    """Gets a judgment from a powerful LLM on the quality of the generated text."""
    prompt = f"""
    You are an expert in evaluating educational content.
    Please evaluate the following generated text based on its accuracy, relevance, and clarity,
    compared to the provided reference text.

    Reference Text:
    {reference_text}

    Generated Text:
    {generated_text}

    Provide your judgment as one of the following categories:
    - "no issues": The generated text is accurate, relevant, and clear.
    - "minor issues": The generated text has some minor issues, but is mostly acceptable.
    - "major issues": The generated text has significant issues and needs substantial revision.
    """
    # max_new_tokens (rather than max_length, which also counts the prompt) keeps the
    # reply short, and return_full_text=False drops the echoed prompt from the output.
    judgment = judge(prompt, max_new_tokens=20, return_full_text=False)[0]['generated_text'].strip()
    return judgment


# --- Content Analysis Function ---
def analyze_content_for_review(generated_text, reference_text,
                               similarity_threshold,
                               bertscore_threshold,
                               bleu_threshold,
                               rouge_threshold,
                               meteor_threshold):
    """Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
    similarity = estimate_semantic_similarity(generated_text, reference_text)
    metrics = calculate_metrics(generated_text, reference_text)
    llm_judgment = get_llm_judgment(generated_text, reference_text)

    issues = []
    if similarity < similarity_threshold:
        issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
    if metrics['bertscore'] is not None and metrics['bertscore'] < bertscore_threshold:
        issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
    if metrics['bleu'] is not None and metrics['bleu'] < bleu_threshold:
        issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
    if metrics['rougeL'] is not None and metrics['rougeL'] < rouge_threshold:
        issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
    if metrics['meteor'] is not None and metrics['meteor'] < meteor_threshold:
        issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")

    # Use the LLM judgment as the primary decision-maker. Match by substring,
    # since the judge may wrap the category label in extra text.
    judgment_lower = llm_judgment.lower()
    if "major issues" in judgment_lower:
        review_flag = True
        explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
    elif "minor issues" in judgment_lower:
        review_flag = True
        explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
    else:
        review_flag = False
        explanation = "LLM Judgment: **No Issues**"

    return {
        'review_flag': review_flag,
        'explanation': explanation,
        'semantic_similarity': similarity,
        'metrics': metrics,
        'issues': issues,  # threshold-based flags, used by the optimization objective below
        'llm_judgment': llm_judgment,
        'generated_text': generated_text,
        'reference_text': reference_text
    }
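
# Example call (illustrative threshold values only; in this script the real
# thresholds come from the Hyperopt search below):
#
#   report = analyze_content_for_review(generated, reference, 0.7, 0.85, 0.5, 0.5, 0.5)
#   print(report['review_flag'])
#   print(report['explanation'])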


# --- Threshold Optimization Functions ---
def generate_educational_content(topic, num_sections=3):
    """Generates educational content with chapters, topics, sections, and subsections."""
    prompt = f"""
    Generate a chapter of educational content on the topic of "{topic}".
    The chapter should include {num_sections} sections, each with at least
    one subsection. The content should be factually accurate, well-organized,
    and written in clear and concise Portuguese.
    """
    # return_full_text=False keeps only the newly generated continuation, not the prompt.
    generated_content = generator(prompt, max_length=1000, return_full_text=False)[0]['generated_text']
    return generated_content

def objective(params):
    """Objective function for Hyperopt to minimize."""
    similarity_threshold = params['similarity_threshold']
    bertscore_threshold = params['bertscore_threshold']
    bleu_threshold = params['bleu_threshold']
    rouge_threshold = params['rouge_threshold']
    meteor_threshold = params['meteor_threshold']

    # Generate AI-created data
    topics = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"]  # extend as needed
    generated_texts = []
    reference_texts = []
    for topic in topics:
        reference_text = generate_educational_content(topic)
        generated_text = generate_educational_content(topic)
        generated_texts.append(generated_text)
        reference_texts.append(reference_text)

    total_errors = 0
    for gen_text, ref_text in zip(generated_texts, reference_texts):
        result = analyze_content_for_review(gen_text, ref_text,
                                             similarity_threshold,
                                             bertscore_threshold,
                                             bleu_threshold,
                                             rouge_threshold,
                                             meteor_threshold)
        # A threshold "error" is a disagreement: the metric thresholds flagged
        # issues even though the LLM judge reported no issues.
        if result['issues'] and "no issues" in result['llm_judgment'].lower():
            total_errors += 1

    # Log metrics and parameters to MLflow
    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_metric("total_errors", total_errors)

    return {'loss': total_errors, 'status': STATUS_OK}


# --- Main Execution ---
if __name__ == "__main__":
    # 1. Threshold Optimization Phase
    mlflow.set_tracking_uri("http://localhost:5000")  # Or your MLflow server URI
    search_space = {  # Hyperparameter search space
        'similarity_threshold': hp.uniform('similarity_threshold', 0.5, 0.9),
        'bertscore_threshold': hp.uniform('bertscore_threshold', 0.7, 0.95),
        'bleu_threshold': hp.uniform('bleu_threshold', 0.4, 0.8),
        'rouge_threshold': hp.uniform('rouge_threshold', 0.4, 0.7),
        'meteor_threshold': hp.uniform('meteor_threshold', 0.3, 0.7)
    }
    trials = Trials()
    best_thresholds = fmin(fn=objective,
                           space=search_space,
                           algo=tpe.suggest,
                           max_evals=50,  # Adjust the number of evaluations as needed
                           trials=trials)
    print("Best thresholds found:", best_thresholds)

    # 2. Content Evaluation Phase (using the best thresholds)
    new_generated_text = generate_educational_content("Matemática")  # Example
    new_reference_text = "Content from your educational material..."

    evaluation_result = analyze_content_for_review(
        new_generated_text, new_reference_text,
        best_thresholds['similarity_threshold'],
        best_thresholds['bertscore_threshold'],
        best_thresholds['bleu_threshold'],
        best_thresholds['rouge_threshold'],
        best_thresholds['meteor_threshold']
    )

    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")
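
# To run this script end-to-end, an MLflow tracking server should be listening on
# the URI configured above, e.g. started in another terminal with
# `mlflow server --host 127.0.0.1 --port 5000` (or simply `mlflow ui`), after which
# the script itself can be launched with `python metrics.py`.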