Create metrics.py
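Adds a metrics module that scores AI-generated Portuguese educational content against reference text: semantic similarity (Sentence Transformers), BERTScore, BLEU, ROUGE-L and METEOR, plus an LLM judgment, and tunes the flagging thresholds with Hyperopt while logging each trial to MLflow.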
metrics.py ADDED (+247, -0)
import nltk
import mlflow
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer, util
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from tqdm import tqdm
from datasets import load_metric  # Note: recent `datasets` versions removed load_metric; use `evaluate.load` from the evaluate library instead

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# --- Load pre-trained models ---
# Research and update these with the most recent and powerful Portuguese models
semantic_similarity_model = SentenceTransformer('neuralmind/bert-large-portuguese-cased')
# A causal (GPT-style) Portuguese model is needed here: the unicamp-dl/ptt5-* checkpoints are
# T5 encoder-decoders and cannot be loaded with AutoModelForCausalLM.
perplexity_model_name = "pierreguillou/gpt2-small-portuguese"
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)

# Load Hugging Face metrics
bertscore_metric = load_metric("bertscore")
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
meteor_metric = load_metric("meteor")  # Additional metric

# Load a powerful LLM for generating and judging content
# NOTE: "gpt-3.5-turbo" is an OpenAI API model and cannot be loaded with transformers.pipeline;
# point this at an open Hugging Face text-generation model, or use the API-backed sketch below.
generator_model_name = "gpt-3.5-turbo"  # Or GPT-4 or Gemini if available
generator = pipeline("text-generation", model=generator_model_name)
judge_model_name = generator_model_name  # Using the same model for judging
judge = pipeline("text-generation", model=judge_model_name)

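# --- Optional: API-backed judge (sketch, not wired into the flow below) ---
# A minimal alternative if gpt-3.5-turbo really is the target model. It assumes the `openai`
# package (>= 1.0) is installed and OPENAI_API_KEY is set; the function name is illustrative.
def get_llm_judgment_via_api(prompt, model_name="gpt-3.5-turbo"):
    """Judge a prompt through the OpenAI chat API instead of a local pipeline."""
    from openai import OpenAI  # imported lazily so the module still loads without `openai`
    client = OpenAI()
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=20,
    )
    return response.choices[0].message.content.strip().lower()
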
# --- Helper Functions ---
def calculate_perplexity(text):
    """Calculates perplexity of text using a Portuguese LLM model."""
    try:
        with torch.no_grad():
            tokenize_input = perplexity_tokenizer.tokenize(text)
            tensor_input = perplexity_tokenizer.encode(text, return_tensors='pt')
            loss = perplexity_model(tensor_input, labels=tensor_input)[0]
            return torch.exp(loss).item()  # perplexity = exp(mean token cross-entropy)
    except Exception as e:
        print(f"Error calculating perplexity: {e}")
        return float('inf')


def estimate_semantic_similarity(generated_text, reference_text):
    """Estimates semantic similarity using a Portuguese Sentence Transformer."""
    try:
        embedding1 = semantic_similarity_model.encode(generated_text, convert_to_tensor=True)
        embedding2 = semantic_similarity_model.encode(reference_text, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_sim.item()
    except Exception as e:
        print(f"Error calculating semantic similarity: {e}")
        return 0.0


def calculate_metrics(generated_text, reference_text):
    """Calculates BERTScore, BLEU, ROUGE, and METEOR metrics."""
    results = {}
    try:
        results['bertscore'] = bertscore_metric.compute(predictions=[generated_text], references=[reference_text], lang="pt")['f1'][0]
    except Exception as e:
        print(f"Error calculating BERTScore: {e}")
        results['bertscore'] = None

    try:
        # The legacy `bleu` metric expects pre-tokenized predictions and references
        bleu_results = bleu_metric.compute(predictions=[generated_text.split()], references=[[reference_text.split()]])
        results['bleu'] = bleu_results['bleu']
    except Exception as e:
        print(f"Error calculating BLEU: {e}")
        results['bleu'] = None

    try:
        rouge_results = rouge_metric.compute(predictions=[generated_text], references=[reference_text])
        rouge_l = rouge_results['rougeL']
        # Older `datasets` versions return an AggregateScore here; newer `evaluate` versions return a float
        results['rougeL'] = rouge_l.mid.fmeasure if hasattr(rouge_l, 'mid') else rouge_l
    except Exception as e:
        print(f"Error calculating ROUGE: {e}")
        results['rougeL'] = None

    try:
        meteor_results = meteor_metric.compute(predictions=[generated_text], references=[reference_text])
        results['meteor'] = meteor_results['meteor']
    except Exception as e:
        print(f"Error calculating METEOR: {e}")
        results['meteor'] = None

    return results


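# Shape of the dict returned by calculate_metrics (each value is a float, or None if that
# metric failed); these keys are consumed by analyze_content_for_review below:
#   {'bertscore': ..., 'bleu': ..., 'rougeL': ..., 'meteor': ...}
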
def get_llm_judgment(generated_text, reference_text):
    """Gets a judgment from a powerful LLM on the quality of the generated text."""
    prompt = f"""
    You are an expert in evaluating educational content.
    Please evaluate the following generated text based on its accuracy, relevance, and clarity,
    compared to the provided reference text.

    Reference Text:
    {reference_text}

    Generated Text:
    {generated_text}

    Provide your judgment as one of the following categories:
    - "no issues": The generated text is accurate, relevant, and clear.
    - "minor issues": The generated text has some minor issues, but is mostly acceptable.
    - "major issues": The generated text has significant issues and needs substantial revision.
    """
    # Generate only a short verdict, strip the echoed prompt, and lowercase it so the
    # category checks below are case-insensitive.
    judgment = judge(prompt, max_new_tokens=20, return_full_text=False)[0]['generated_text'].strip().lower()
    return judgment


# --- Content Analysis Function ---
def analyze_content_for_review(generated_text, reference_text,
                               similarity_threshold,
                               bertscore_threshold,
                               bleu_threshold,
                               rouge_threshold,
                               meteor_threshold):
    """Analyzes content and flags potential issues based on provided thresholds and LLM judgment."""
    similarity = estimate_semantic_similarity(generated_text, reference_text)
    metrics = calculate_metrics(generated_text, reference_text)
    llm_judgment = get_llm_judgment(generated_text, reference_text)

    issues = []
    if similarity < similarity_threshold:
        issues.append(f"- **Low Semantic Similarity:** ({similarity:.2f}) Content might be off-topic or not factually aligned.")
    if metrics['bertscore'] is not None and metrics['bertscore'] < bertscore_threshold:
        issues.append(f"- **Low BERTScore:** ({metrics['bertscore']:.2f}) There might be factual inaccuracies or significant paraphrasing.")
    if metrics['bleu'] is not None and metrics['bleu'] < bleu_threshold:
        issues.append(f"- **Low BLEU Score:** ({metrics['bleu']:.2f}) The generated text might not be fluent or use appropriate wording.")
    if metrics['rougeL'] is not None and metrics['rougeL'] < rouge_threshold:
        issues.append(f"- **Low ROUGE-L Score:** ({metrics['rougeL']:.2f}) The generated text might not cover important information from the reference.")
    if metrics['meteor'] is not None and metrics['meteor'] < meteor_threshold:
        issues.append(f"- **Low METEOR Score:** ({metrics['meteor']:.2f}) The generated text might have poor word alignment with the reference.")

    # Use LLM judgment as the primary decision-maker (substring match, since the model may
    # wrap the category in extra words)
    if "major issues" in llm_judgment:
        review_flag = True
        explanation = "LLM Judgment: **Major Issues**\n" + "\n".join(issues)
    elif "minor issues" in llm_judgment:
        review_flag = True
        explanation = "LLM Judgment: **Minor Issues**\n" + "\n".join(issues)
    else:
        review_flag = False
        explanation = "LLM Judgment: **No Issues**"

    return {
        'review_flag': review_flag,
        'explanation': explanation,
        'semantic_similarity': similarity,
        'metrics': metrics,
        'llm_judgment': llm_judgment,
        'threshold_issues': issues,  # exposed so the tuning objective can compare threshold flags with the LLM verdict
        'generated_text': generated_text,
        'reference_text': reference_text
    }


# --- Threshold Optimization Functions ---
def generate_educational_content(topic, num_sections=3):
    """Generates educational content with chapters, topics, sections, and subsections."""
    prompt = f"""
    Generate a chapter of educational content on the topic of "{topic}".
    The chapter should include {num_sections} sections, each with at least
    one subsection. The content should be factually accurate, well-organized,
    and written in clear and concise Portuguese.
    """
    generated_content = generator(prompt, max_length=1000)[0]['generated_text']
    return generated_content

def objective(params):
    """Objective function for Hyperopt to minimize."""
    similarity_threshold = params['similarity_threshold']
    bertscore_threshold = params['bertscore_threshold']
    bleu_threshold = params['bleu_threshold']
    rouge_threshold = params['rouge_threshold']
    meteor_threshold = params['meteor_threshold']

    # Generate AI-created data
    topics = ["Astronomia", "Biologia", "História", "Matemática", "Física", "Química"]  # More topics
    generated_texts = []
    reference_texts = []
    for topic in topics:
        reference_text = generate_educational_content(topic)
        generated_text = generate_educational_content(topic)
        generated_texts.append(generated_text)
        reference_texts.append(reference_text)

    total_errors = 0
    for gen_text, ref_text in zip(generated_texts, reference_texts):
        result = analyze_content_for_review(gen_text, ref_text,
                                             similarity_threshold,
                                             bertscore_threshold,
                                             bleu_threshold,
                                             rouge_threshold,
                                             meteor_threshold)
        # Count a disagreement when the thresholds flag issues but the LLM-driven review flag
        # does not (the flag itself follows the LLM judgment, so the original check never fired).
        if result['threshold_issues'] and not result['review_flag']:
            total_errors += 1

    # Log metrics and parameters to MLflow
    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_metric("total_errors", total_errors)

    return {'loss': total_errors, 'status': STATUS_OK}

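# Design note on the objective above: the loss counts texts that the thresholds would flag even
# though the LLM judge sees no issues, so the TPE search favors thresholds that do not over-flag
# relative to the judge's verdicts on the generated sample.
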
# --- Main Execution ---
if __name__ == "__main__":
    # 1. Threshold Optimization Phase
    mlflow.set_tracking_uri("http://localhost:5000")  # Or your MLflow server URI
    search_space = {  # Hyperparameter search space
        'similarity_threshold': hp.uniform('similarity_threshold', 0.5, 0.9),
        'bertscore_threshold': hp.uniform('bertscore_threshold', 0.7, 0.95),
        'bleu_threshold': hp.uniform('bleu_threshold', 0.4, 0.8),
        'rouge_threshold': hp.uniform('rouge_threshold', 0.4, 0.7),
        'meteor_threshold': hp.uniform('meteor_threshold', 0.3, 0.7)
    }
    trials = Trials()
    best_thresholds = fmin(fn=objective,
                           space=search_space,
                           algo=tpe.suggest,
                           max_evals=50,  # Adjust the number of evaluations as needed
                           trials=trials)
    print("Best thresholds found:", best_thresholds)

    # 2. Content Evaluation Phase (using the best thresholds)
    new_generated_text = generate_educational_content("Matemática")  # Example
    new_reference_text = "Content from your educational material..."

    evaluation_result = analyze_content_for_review(
        new_generated_text, new_reference_text,
        best_thresholds['similarity_threshold'],
        best_thresholds['bertscore_threshold'],
        best_thresholds['bleu_threshold'],
        best_thresholds['rouge_threshold'],
        best_thresholds['meteor_threshold']
    )

    print("\n----- Evaluation Result -----")
    print(f"Review Flag: {evaluation_result['review_flag']}")
    print(f"Explanation: {evaluation_result['explanation']}")
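# To exercise the full script, start an MLflow tracking server that matches the URI above
# (e.g. `mlflow server --port 5000`) and make sure the Hugging Face models and metrics can be
# downloaded; the number of Hyperopt evaluations (max_evals) drives the total runtime.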