import nltk
import mlflow
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer, util
from datasets import load_metric  # Note: in recent versions of `datasets`, metrics moved to the separate `evaluate` package
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')  # The METEOR metric may require WordNet data
# --- Load pre-trained models ---
# Research and update these with the most recent and powerful Portuguese models
semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
# Perplexity needs a decoder-only (GPT-style) model; PTT5 is an encoder-decoder (T5) and will
# not load with AutoModelForCausalLM. A Portuguese GPT-2 checkpoint is one suitable option.
perplexity_model_name = "pierreguillou/gpt2-small-portuguese"
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)
# Load Hugging Face metrics
bertscore_metric = load_metric("bertscore")
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
meteor_metric = load_metric("meteor") # Additional metric
# Load a powerful LLM for generating and judging content.
# Note: "gpt-3.5-turbo", GPT-4, and Gemini are API-only models and cannot be loaded with
# transformers' pipeline(); to run this locally, substitute an open, Portuguese-capable
# text-generation model hosted on the Hugging Face Hub.
generator_model_name = "gpt-3.5-turbo"  # Replace with a Hub-hosted model for local use
generator = pipeline("text-generation", model=generator_model_name)
judge_model_name = generator_model_name  # The same model is reused as the judge
judge = pipeline("text-generation", model=judge_model_name)
# --- Helper Functions ---
def calculate_perplexity(text):
    """Calculates perplexity of text using a Portuguese causal language model."""
    try:
        with torch.no_grad():
            tensor_input = perplexity_tokenizer.encode(text, return_tensors='pt')
            # The model returns the cross-entropy loss when labels are provided;
            # exponentiating it gives the per-token perplexity.
            loss = perplexity_model(tensor_input, labels=tensor_input)[0]
            return torch.exp(loss).item()
    except Exception as e:
        print(f"Error calculating perplexity: {e}")
        return float('inf')
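# Illustrative usage (values are hypothetical): fluent Portuguese should score a much
# lower perplexity than scrambled text, e.g.
#   calculate_perplexity("O Sol é uma estrela.")   # relatively low
#   calculate_perplexity("Sol estrela uma é o.")   # noticeably higher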
def estimate_semantic_similarity(generated_text, reference_text):
    """Estimates semantic similarity using a Portuguese Sentence Transformer."""
    try:
        embedding1 = semantic_similarity_model.encode(generated_text, convert_to_tensor=True)
        embedding2 = semantic_similarity_model.encode(reference_text, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_sim.item()
    except Exception as e:
        print(f"Error calculating semantic similarity: {e}")
        return 0.0
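# Illustrative usage (value is hypothetical):
#   estimate_semantic_similarity("A água ferve a 100 °C.", "A água entra em ebulição a 100 graus Celsius.")
#   -> roughly 0.8-0.9 (cosine similarity; higher means closer meaning)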
def calculate_metrics(generated_text, reference_text):
    """Calculates BERTScore, BLEU, ROUGE, and METEOR metrics."""
    results = {}
    try:
        results['bertscore'] = bertscore_metric.compute(predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
    except Exception as e:
        print(f"Error calculating BERTScore: {e}")
        results['bertscore'] = None
    try:
        # The "bleu" metric expects pre-tokenized predictions and references.
        bleu_results = bleu_metric.compute(predictions=[generated_text.split()], references=[[reference_text.split()]])
        results['bleu'] = bleu_results['bleu']
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        results['bleu'] = None
    try:
        rouge_results = rouge_metric.compute(predictions=[generated_text], references=[reference_text])
        # datasets' rouge metric returns aggregate (low/mid/high) scores; keep the mid F-measure.
        results['rougeL'] = rouge_results['rougeL'].mid.fmeasure
    except Exception as e:
        print(f"Error calculating ROUGE: {e}")
        results['rougeL'] = None
    try:
        meteor_results = meteor_metric.compute(predictions=[generated_text], references=[reference_text])
        results['meteor'] = meteor_results['meteor']
    except Exception as e:
        print(f"Error calculating METEOR: {e}")
        results['meteor'] = None
    return results
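# Illustrative output shape (numbers are hypothetical):
#   calculate_metrics("O gato dorme.", "O gato está dormindo.")
#   -> {'bertscore': 0.91, 'bleu': 0.12, 'rougeL': 0.57, 'meteor': 0.43}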
def get_llm_judgment(generated_text, reference_text):
    """Gets a judgment from a powerful LLM on the quality of the generated text."""
    prompt = f"""
You are an expert in evaluating educational content.
Please evaluate the following generated text based on its accuracy, relevance, and clarity,
compared to the provided reference text.
Reference Text:
{reference_text}
Generated Text:
{generated_text}
Provide your judgment as one of the following categories:
- "no issues": The generated text is accurate, relevant, and clear.
- "minor issues": The generated text has some minor issues, but is mostly acceptable.
- "major issues": The generated text has significant issues and needs substantial revision.
"""
    # max_new_tokens bounds the length of the judgment itself, and return_full_text=False
    # keeps the echoed prompt out of the returned string.
    judgment = judge(prompt, max_new_tokens=50, return_full_text=False)[0]['generated_text'].strip()
    return judgment
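# The judge returns free-form text; downstream code matches the category names
# ("no issues", "minor issues", "major issues") as case-insensitive substrings.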
# --- Content Analysis Function ---
def analyze_content_for_review(generated_text, reference_text,
                               similarity_threshold,
                               bertscore_threshold,
                               bleu_threshold,
                               rouge_threshold,
                               meteor_threshold):
    """Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
    similarity = estimate_semantic_similarity(generated_text, reference_text)
    metrics = calculate_metrics(generated_text, reference_text)
    llm_judgment = get_llm_judgment(generated_text, reference_text)
    issues = []
    if similarity < similarity_threshold:
        issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
    # Explicit None checks so that a legitimate score of 0.0 is still compared against its threshold.
    if metrics['bertscore'] is not None and metrics['bertscore'] < bertscore_threshold:
        issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
    if metrics['bleu'] is not None and metrics['bleu'] < bleu_threshold:
        issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
    if metrics['rougeL'] is not None and metrics['rougeL'] < rouge_threshold:
        issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
    if metrics['meteor'] is not None and metrics['meteor'] < meteor_threshold:
        issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")
    # Use the LLM judgment as the primary decision-maker; match categories as substrings
    # because the judge returns free-form text.
    judgment_lower = llm_judgment.lower()
    if "major issues" in judgment_lower:
        review_flag = True
        explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
    elif "minor issues" in judgment_lower:
        review_flag = True
        explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
    else:
        review_flag = False
        explanation = "LLM Judgment: **No Issues**"
    return {
        'review_flag': review_flag,
        'explanation': explanation,
        'semantic_similarity': similarity,
        'metrics': metrics,
        'llm_judgment': llm_judgment,
        'threshold_issues': issues,
        'generated_text': generated_text,
        'reference_text': reference_text
    }
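# Illustrative usage (thresholds below are placeholders, not tuned values):
#   report = analyze_content_for_review(gen_text, ref_text, 0.7, 0.85, 0.5, 0.5, 0.5)
#   if report['review_flag']:
#       print(report['explanation'])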
# --- Threshold Optimization Functions ---
def generate_educational_content(topic, num_sections=3):
    """Generates educational content with chapters, topics, sections, and subsections."""
    prompt = f"""
Generate a chapter of educational content on the topic of "{topic}".
The chapter should include {num_sections} sections, each with at least
one subsection. The content should be factually accurate, well-organized,
and written in clear and concise Portuguese.
"""
    # Return only the newly generated chapter, without the echoed prompt.
    generated_content = generator(prompt, max_new_tokens=1000, return_full_text=False)[0]['generated_text']
    return generated_content
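# Illustrative usage: generate_educational_content("Astronomia") returns a single string
# with the generated chapter; length is bounded by max_new_tokens above.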
def objective(params):
    """Objective function for Hyperopt to minimize."""
    similarity_threshold = params['similarity_threshold']
    bertscore_threshold = params['bertscore_threshold']
    bleu_threshold = params['bleu_threshold']
    rouge_threshold = params['rouge_threshold']
    meteor_threshold = params['meteor_threshold']
    # Generate AI-created data: one synthetic "reference" and one "generated" text per topic.
    topics = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"]
    generated_texts = []
    reference_texts = []
    for topic in topics:
        reference_text = generate_educational_content(topic)
        generated_text = generate_educational_content(topic)
        generated_texts.append(generated_text)
        reference_texts.append(reference_text)
    total_errors = 0
    for gen_text, ref_text in zip(generated_texts, reference_texts):
        result = analyze_content_for_review(gen_text, ref_text,
                                             similarity_threshold,
                                             bertscore_threshold,
                                             bleu_threshold,
                                             rouge_threshold,
                                             meteor_threshold)
        # Count disagreements between the threshold-based flags and the LLM judgment,
        # e.g. the thresholds flag issues while the LLM reports "no issues" (or vice versa).
        threshold_flagged = bool(result['threshold_issues'])
        llm_flagged = "no issues" not in result['llm_judgment'].lower()
        if threshold_flagged != llm_flagged:
            total_errors += 1
    # Log metrics and parameters to MLflow
    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_metric("total_errors", total_errors)
    return {'loss': total_errors, 'status': STATUS_OK}
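# Hyperopt minimizes the returned 'loss'; here it is the number of topics where the
# threshold-based flags and the LLM judgment disagree, so lower means better-aligned thresholds.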
# --- Main Execution ---
if __name__ == "__main__":
    # 1. Threshold Optimization Phase
    mlflow.set_tracking_uri("http://localhost:5000")  # Or your MLflow server URI
    search_space = {  # Hyperparameter search space
        'similarity_threshold': hp.uniform('similarity_threshold', 0.5, 0.9),
        'bertscore_threshold': hp.uniform('bertscore_threshold', 0.7, 0.95),
        'bleu_threshold': hp.uniform('bleu_threshold', 0.4, 0.8),
        'rouge_threshold': hp.uniform('rouge_threshold', 0.4, 0.7),
        'meteor_threshold': hp.uniform('meteor_threshold', 0.3, 0.7)
    }
    trials = Trials()
    best_thresholds = fmin(fn=objective,
                           space=search_space,
                           algo=tpe.suggest,
                           max_evals=50,  # Adjust the number of evaluations as needed
                           trials=trials)
    print("Best thresholds found:", best_thresholds)

    # 2. Content Evaluation Phase (using the best thresholds)
    new_generated_text = generate_educational_content("Matemática")  # Example
    new_reference_text = "Content from your educational material..."
    evaluation_result = analyze_content_for_review(
        new_generated_text, new_reference_text,
        best_thresholds['similarity_threshold'],
        best_thresholds['bertscore_threshold'],
        best_thresholds['bleu_threshold'],
        best_thresholds['rouge_threshold'],
        best_thresholds['meteor_threshold']
    )
    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")