import nltk
import mlflow
import torch
import evaluate  # Replacement for datasets.load_metric, which was removed from the `datasets` library
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util

# Download necessary NLTK data (punkt for tokenization, wordnet/omw-1.4 for METEOR)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# --- Load pre-trained models ---
# Swap in the most recent and capable Portuguese models available for your use case.
semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')

# Perplexity requires a causal (GPT-style) language model; a T5 checkpoint such as
# unicamp-dl/ptt5-base-portuguese-vocab cannot be loaded with AutoModelForCausalLM,
# so a Portuguese GPT-2 checkpoint is used here instead.
perplexity_model_name = "pierreguillou/gpt2-small-portuguese"
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)
perplexity_model.eval()
# Load Hugging Face metrics via the `evaluate` library
bertscore_metric = evaluate.load("bertscore")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
meteor_metric = evaluate.load("meteor")  # Additional metric
# Load a powerful LLM for generating and judging content.
# Note: "gpt-3.5-turbo", GPT-4 and Gemini are API-only models and cannot be loaded with
# transformers.pipeline; any open instruction-tuned Hub checkpoint can be used instead.
generator_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
generator = pipeline("text-generation", model=generator_model_name)
judge = generator  # Reuse the same pipeline for judging to avoid loading the model twice
# --- Helper Functions ---
def calculate_perplexity(text):
    """Calculates perplexity of text using a Portuguese causal language model."""
    try:
        with torch.no_grad():
            tensor_input = perplexity_tokenizer.encode(text, return_tensors='pt')
            loss = perplexity_model(tensor_input, labels=tensor_input).loss
        return torch.exp(loss).item()
    except Exception as e:
        print(f"Error calculating perplexity: {e}")
        return float('inf')
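# Illustrative usage, not executed here (exact values depend on the chosen checkpoint):
#
#   ppl_fluent = calculate_perplexity("A fotossíntese converte luz solar em energia química.")
#   ppl_scrambled = calculate_perplexity("energia luz química fotossíntese converte a em solar")
#   # A fluent sentence is expected to receive a lower perplexity than a scrambled one,
#   # i.e. ppl_fluent < ppl_scrambled, and errors fall back to float('inf').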
def estimate_semantic_similarity(generated_text, reference_text):
    """Estimates semantic similarity using a Portuguese Sentence Transformer."""
    try:
        embedding1 = semantic_similarity_model.encode(generated_text, convert_to_tensor=True)
        embedding2 = semantic_similarity_model.encode(reference_text, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_sim.item()
    except Exception as e:
        print(f"Error calculating semantic similarity: {e}")
        return 0.0
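# Illustrative usage, not executed here:
#
#   sim = estimate_semantic_similarity(
#       "A água ferve a 100 graus Celsius ao nível do mar.",
#       "Ao nível do mar, o ponto de ebulição da água é 100 °C.")
#   # Close paraphrases like these should score near 1.0, unrelated sentences much lower;
#   # 0.0 is returned when encoding fails.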
def calculate_metrics(generated_text, reference_text):
    """Calculates BERTScore, BLEU, ROUGE, and METEOR metrics."""
    results = {}
    try:
        results['bertscore'] = bertscore_metric.compute(
            predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
    except Exception as e:
        print(f"Error calculating BERTScore: {e}")
        results['bertscore'] = None
    try:
        # The `evaluate` BLEU metric expects raw strings and tokenizes internally.
        bleu_results = bleu_metric.compute(predictions=[generated_text], references=[[reference_text]])
        results['bleu'] = bleu_results['bleu']
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        results['bleu'] = None
    try:
        rouge_results = rouge_metric.compute(predictions=[generated_text], references=[reference_text])
        results['rougeL'] = rouge_results['rougeL']
    except Exception as e:
        print(f"Error calculating ROUGE: {e}")
        results['rougeL'] = None
    try:
        meteor_results = meteor_metric.compute(predictions=[generated_text], references=[reference_text])
        results['meteor'] = meteor_results['meteor']
    except Exception as e:
        print(f"Error calculating METEOR: {e}")
        results['meteor'] = None
    return results
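# Illustrative return shape, not executed here (the numbers are placeholders, not real results):
#
#   calculate_metrics("O Sol é uma estrela.", "O Sol é a estrela central do Sistema Solar.")
#   # -> {'bertscore': 0.87, 'bleu': 0.21, 'rougeL': 0.55, 'meteor': 0.43}
#   # Every key is always present; a metric that raised an exception is set to None.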
def get_llm_judgment(generated_text, reference_text):
    """Gets a judgment from a powerful LLM on the quality of the generated text."""
    prompt = f"""
    You are an expert in evaluating educational content.
    Please evaluate the following generated text based on its accuracy, relevance, and clarity,
    compared to the provided reference text.

    Reference Text:
    {reference_text}

    Generated Text:
    {generated_text}

    Answer with exactly one of the following categories:
    - "no issues": The generated text is accurate, relevant, and clear.
    - "minor issues": The generated text has some minor issues, but is mostly acceptable.
    - "major issues": The generated text has significant issues and needs substantial revision.

    Judgment:"""
    # max_new_tokens (rather than max_length) keeps the prompt from consuming the generation
    # budget, and return_full_text=False drops the prompt from the returned string.
    raw_judgment = judge(prompt, max_new_tokens=10, return_full_text=False)[0]['generated_text'].lower()
    # Normalize free-form output to one of the three expected categories.
    for category in ("major issues", "minor issues", "no issues"):
        if category in raw_judgment:
            return category
    return "minor issues"  # Conservative default when the answer cannot be parsed
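# Illustrative usage, not executed here:
#
#   verdict = get_llm_judgment(generated_text="...texto gerado...",
#                              reference_text="...texto de referência...")
#   # `verdict` is always one of "no issues", "minor issues" or "major issues",
#   # which is exactly what analyze_content_for_review() branches on below.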
# --- Content Analysis Function ---
def analyze_content_for_review(generated_text, reference_text,
                               similarity_threshold,
                               bertscore_threshold,
                               bleu_threshold,
                               rouge_threshold,
                               meteor_threshold):
    """Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
    similarity = estimate_semantic_similarity(generated_text, reference_text)
    metrics = calculate_metrics(generated_text, reference_text)
    llm_judgment = get_llm_judgment(generated_text, reference_text)
    issues = []
    if similarity < similarity_threshold:
        issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
    # Compare against None explicitly so a legitimate score of 0.0 is not silently skipped.
    if metrics['bertscore'] is not None and metrics['bertscore'] < bertscore_threshold:
        issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
    if metrics['bleu'] is not None and metrics['bleu'] < bleu_threshold:
        issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
    if metrics['rougeL'] is not None and metrics['rougeL'] < rouge_threshold:
        issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
    if metrics['meteor'] is not None and metrics['meteor'] < meteor_threshold:
        issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")
    # Use LLM judgment as the primary decision-maker
    if llm_judgment == "major issues":
        review_flag = True
        explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
    elif llm_judgment == "minor issues":
        review_flag = True
        explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
    else:
        review_flag = False
        explanation = "LLM Judgment: **No Issues**"
    return {
        'review_flag': review_flag,
        'explanation': explanation,
        'semantic_similarity': similarity,
        'metrics': metrics,
        'metric_issues': issues,  # Threshold-based findings, used by the optimization objective
        'llm_judgment': llm_judgment,
        'generated_text': generated_text,
        'reference_text': reference_text
    }
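# Illustrative usage, not executed here (the threshold values are arbitrary placeholders;
# the optimization phase below searches for better ones):
#
#   report = analyze_content_for_review(
#       "...texto gerado...", "...texto de referência...",
#       similarity_threshold=0.7, bertscore_threshold=0.85,
#       bleu_threshold=0.5, rouge_threshold=0.5, meteor_threshold=0.4)
#   if report['review_flag']:
#       print(report['explanation'])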
# --- Threshold Optimization Functions ---
def generate_educational_content(topic, num_sections=3):
    """Generates educational content with chapters, topics, sections, and subsections."""
    prompt = f"""
    Generate a chapter of educational content on the topic of "{topic}".
    The chapter should include {num_sections} sections, each with at least
    one subsection. The content should be factually accurate, well-organized,
    and written in clear and concise Portuguese.
    """
    # max_new_tokens bounds only the generated continuation; return_full_text=False
    # keeps the prompt out of the returned string.
    generated_content = generator(prompt, max_new_tokens=1000, return_full_text=False)[0]['generated_text']
    return generated_content
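# Illustrative usage, not executed here:
#
#   chapter = generate_educational_content("Astronomia", num_sections=2)
#   # `chapter` is a single string holding the model's continuation of the prompt; its
#   # structure and factual quality depend entirely on the generator model chosen above.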
def objective(params):
    """Objective function for Hyperopt to minimize."""
    similarity_threshold = params['similarity_threshold']
    bertscore_threshold = params['bertscore_threshold']
    bleu_threshold = params['bleu_threshold']
    rouge_threshold = params['rouge_threshold']
    meteor_threshold = params['meteor_threshold']
    # Generate AI-created data
    topics = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"]
    generated_texts = []
    reference_texts = []
    for topic in topics:
        reference_text = generate_educational_content(topic)
        generated_text = generate_educational_content(topic)
        generated_texts.append(generated_text)
        reference_texts.append(reference_text)
    # Count disagreements between the threshold-based findings and the LLM judgment,
    # so that the loss actually depends on the thresholds being tuned.
    total_errors = 0
    for gen_text, ref_text in zip(generated_texts, reference_texts):
        result = analyze_content_for_review(gen_text, ref_text,
                                            similarity_threshold,
                                            bertscore_threshold,
                                            bleu_threshold,
                                            rouge_threshold,
                                            meteor_threshold)
        metrics_flag = len(result['metric_issues']) > 0
        llm_flag = result['llm_judgment'] != "no issues"
        if metrics_flag != llm_flag:
            total_errors += 1
    # Log metrics and parameters to MLflow
    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_metric("total_errors", total_errors)
    return {'loss': total_errors, 'status': STATUS_OK}
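# How Hyperopt drives this function (illustrative, not executed here):
#
#   fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=50, trials=Trials())
#   # calls objective() once per candidate, sampling each threshold from the hp.uniform
#   # ranges defined in the main block, and keeps the candidate with the lowest loss.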
# --- Main Execution ---
if __name__ == "__main__":
    # 1. Threshold Optimization Phase
    mlflow.set_tracking_uri("http://localhost:5000")  # Or your MLflow server URI
    search_space = {  # Hyperparameter search space
        'similarity_threshold': hp.uniform('similarity_threshold', 0.5, 0.9),
        'bertscore_threshold': hp.uniform('bertscore_threshold', 0.7, 0.95),
        'bleu_threshold': hp.uniform('bleu_threshold', 0.4, 0.8),
        'rouge_threshold': hp.uniform('rouge_threshold', 0.4, 0.7),
        'meteor_threshold': hp.uniform('meteor_threshold', 0.3, 0.7)
    }
    trials = Trials()
    best_thresholds = fmin(fn=objective,
                           space=search_space,
                           algo=tpe.suggest,
                           max_evals=50,  # Adjust the number of evaluations as needed
                           trials=trials)
    print("Best thresholds found:", best_thresholds)
    # 2. Content Evaluation Phase (using the best thresholds)
    new_generated_text = generate_educational_content("Matemática")  # Example
    new_reference_text = "Content from your educational material..."
    evaluation_result = analyze_content_for_review(
        new_generated_text, new_reference_text,
        best_thresholds['similarity_threshold'],
        best_thresholds['bertscore_threshold'],
        best_thresholds['bleu_threshold'],
        best_thresholds['rouge_threshold'],
        best_thresholds['meteor_threshold']
    )
    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")